Natural Language Processing (NLP) enables computers to understand and generate human language. TensorFlow provides tools for every step of the pipeline, from preprocessing raw text to training and deploying models.

Setup

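  pip install tensorflow
  pip install tensorflow-datasets tensorflow-hub tensorflow-text  # only needed for the "Using Real Datasets" and "Pre-trained Models with TF Hub" sections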
  

Text Preprocessing

  import tensorflow as tf

# Toy vocabulary for a simple word-to-id encoding.
vocabulary = ["the", "cat", "sat", "on", "mat", "dog", "ran"]
# Ids are 1-based positions in the vocabulary, leaving 0 free for padding.
word_to_idx = dict(zip(vocabulary, range(1, len(vocabulary) + 1)))

def encode(text, word_to_idx, max_len=10):
    """Convert ``text`` into a list of word ids padded or truncated to ``max_len``.

    The text is lower-cased and split on whitespace. Each token is looked up
    in ``word_to_idx``; tokens missing from the mapping become 0, the same
    value that is used for padding. Inputs longer than ``max_len`` are cut
    off, shorter ones are right-padded with 0.
    """
    token_ids = [word_to_idx.get(token, 0) for token in text.lower().split()]
    clipped = token_ids[:max_len]
    # A non-positive repeat count produces an empty list, so nothing is
    # appended when the sequence is already long enough.
    return clipped + [0] * (max_len - len(clipped))

# Five known words followed by five padding zeros (max_len defaults to 10).
encoded = encode("the cat sat on mat", word_to_idx)
print(encoded)
# [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
  

Built-in preprocessing as a Keras layer:

  from tensorflow.keras.layers import TextVectorization

# Tiny labelled corpus: (review text, label) with 1=positive, 0=negative.
labeled_reviews = [
    ("I love this movie", 1),
    ("Terrible film, waste of time", 0),
    ("Absolutely fantastic performance", 1),
    ("Boring and predictable", 0),
    ("Best movie I've ever seen", 1),
    ("Awful acting and plot", 0),
]
train_texts = [review for review, _ in labeled_reviews]
train_labels = [sentiment for _, sentiment in labeled_reviews]

# Keep at most 1000 vocabulary entries; every output sequence has length 50.
vectorizer = TextVectorization(max_tokens=1000, output_sequence_length=50)
# adapt() builds the vocabulary from the training texts.
vectorizer.adapt(train_texts)

sample = vectorizer(["I love this movie"])
print(sample)  # tensor of word indices
vocab_preview = vectorizer.get_vocabulary()[:10]
print(vocab_preview)
  

Embedding Layer

Convert word indices to dense vectors:

  from tensorflow import keras
from tensorflow.keras import layers

# Size of the vocabulary learned by adapt(); this is the number of rows the
# embedding table needs.
vocab_size = len(vectorizer.get_vocabulary())
embedding_dim = 32

# Bag-of-embeddings classifier: raw strings -> word ids -> averaged word
# vectors -> small dense head -> probability of the positive class.
model = keras.Sequential([
    vectorizer,
    # mask_zero=True marks padding (index 0) as masked so the pooling layer
    # below averages only the real tokens instead of all 50 positions. This
    # also matches the LSTM model, which sets mask_zero=True as well.
    layers.Embedding(vocab_size, embedding_dim, mask_zero=True),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])

# Single sigmoid output with 0/1 labels, hence binary cross-entropy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()
  

Train Sentiment Classifier

  import numpy as np

# Convert the labels to an array and pair them with the raw strings; the
# model's first layer (the vectorizer) turns the strings into word ids.
train_labels = np.array(train_labels)
train_ds = tf.data.Dataset.from_tensor_slices((train_texts, train_labels)).batch(2)

history = model.fit(train_ds, epochs=50, verbose=0)
print(f"Final accuracy: {history.history['accuracy'][-1]:.2f}")

# Predict
# Pass a string tensor rather than a bare Python list: Keras 3 rejects plain
# lists in predict() ("Unrecognized data type"), while a tensor is accepted
# by both Keras 2 and Keras 3.
predictions = model.predict(tf.constant(["I really enjoyed this film"]))
# predictions has one row per input and one column (the sigmoid output).
print(f"Positive probability: {predictions[0][0]:.2f}")
  

LSTM for Sequence Modeling

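The averaging model above ignores word order (a bag-of-words approach). LSTMs process tokens in sequence and therefore capture word order, which makes them a better fit for many tasks: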

  model_lstm = keras.Sequential([
    vectorizer,
    layers.Embedding(vocab_size, 64, mask_zero=True),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid"),
])

model_lstm.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

model_lstm.fit(train_ds, epochs=30, verbose=0)
  

Using Real Datasets

  import tensorflow_datasets as tfds

# IMDB movie reviews — 50,000 labeled reviews
# as_supervised=True yields (text, label) pairs; with_info=True also returns
# the dataset metadata.
imdb_splits, info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    as_supervised=True,
    with_info=True,
)
train_ds, test_ds = imdb_splits

# Peek at the first training example.
for text, label in train_ds.take(1):
    review = text.numpy().decode()
    print(review[:200])
    print(f"Label: {label.numpy()}")  # 0=neg, 1=pos
  

Save and Load

  model.save("sentiment_model.keras")

loaded = keras.models.load_model("sentiment_model.keras")
loaded.predict(["Great product, highly recommend!"])
  

NLP Task Overview

| Task | Approach | TensorFlow Tool |
| --- | --- | --- |
| Sentiment analysis | Classification | Embedding + Dense/LSTM |
| Text generation | Sequence prediction | LSTM / Transformer |
| Translation | Seq2seq | Encoder-Decoder |
| Named entity recognition | Token classification | Bidirectional LSTM |
| Question answering | Span extraction | BERT (via TF Hub) |

Pre-trained Models with TF Hub

  import tensorflow_hub as hub

# Use a pre-trained BERT model
# NOTE: the preprocessing model relies on custom ops from the tensorflow_text
# package; `import tensorflow_text` must run before this layer is loaded,
# otherwise loading fails with an unregistered-op error.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
# Small BERT handles spell out the model size (L = layers, H = hidden size,
# A = attention heads); the original "small_bert_en/1" does not follow this scheme.
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1")
# NOTE(review): on TF >= 2.16 (Keras 3) hub.KerasLayer may need the legacy
# tf_keras package to be used in a functional model — confirm against the
# installed versions.

# Scalar string input: one raw sentence per example.
text_input = keras.Input(shape=(), dtype=tf.string)
preprocessed = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed)
# Build classifier on top of outputs["pooled_output"]
  

For most NLP tasks today, start with Hugging Face for pre-trained models. Use TensorFlow when you need custom architectures or TF Serving deployment.