ValueError: Can't convert non-rectangular Python sequence to Tensor in text-classification problem

Question

I am building a text classification system which requires a large preprocessing and training script. The script reads variable-length token sequences and attempts to build a tf.data.Dataset using from_tensor_slices. It then performs several dataset transformations, builds a simple Keras model, and calls model.fit().

However, when I run it, I immediately get the following error:

ValueError: Can't convert non-rectangular Python sequence to Tensor

For reference, I used ideas from this example notebook.

def make_random_sequence(min_len=1, max_len=20):
    length = random.randint(min_len, max_len)
    # tokens in [1, VOCAB_SIZE), 0 reserved for padding
    return [random.randint(1, VOCAB_SIZE - 1) for _ in range(length)]

sequences = [make_random_sequence(min_len=1, max_len=30) for _ in range(NUM_SAMPLES)]
labels = [random.randint(0, 1) for _ in range(NUM_SAMPLES)]
 
ds = tf.data.Dataset.from_tensor_slices((sequences, labels)) 

def _preprocess(x, y):
    x = tf.cast(x, tf.int32)  
    y = tf.cast(y, tf.int32)
    seq_len = tf.shape(x)[0]
    return {"tokens": x, "length": seq_len}, y

ds = ds.shuffle(buffer_size=NUM_SAMPLES)
ds = ds.map(_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.padded_batch(batch_size=8, padded_shapes=({"tokens": [None], "length": []}, []))
ds = ds.prefetch(tf.data.AUTOTUNE)

token_input = tf.keras.Input(shape=(None,), dtype=tf.int32, name="tokens")
length_input = tf.keras.Input(shape=(), dtype=tf.int32, name="length")

emb = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True)(token_input)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(emb)
x = tf.keras.layers.Concatenate()([x, tf.keras.layers.Dense(16)(tf.cast(length_input, tf.float32))])
out = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=[token_input, length_input], outputs=out)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

Sagar · Accepted Answer · 2025-11-27 06:33:04Z

The ValueError was raised because you provided a Python list containing sequences of differing lengths, whereas tf.data.Dataset requires uniformly shaped inputs. You can fix this by using tf.RaggedTensor, which follow the non-rectangular sequences of the inputs by maintaining their shape prior to being sliced, which in turn keeps it compatible with the tf.data pipeline. You can refer the similar issue here.

Code:

def make_random_sequence(min_len=1, max_len=20):
    length = random.randint(min_len, max_len)
    return [random.randint(1, VOCAB_SIZE - 1) for _ in range(length)]

sequences = [make_random_sequence(min_len=1, max_len=30) for _ in range(NUM_SAMPLES)]
labels = [random.randint(0, 1) for _ in range(NUM_SAMPLES)]

ragged_sequences = tf.ragged.constant(sequences)
ds = tf.data.Dataset.from_tensor_slices((ragged_sequences, labels))

def _preprocess(x, y):
    x = tf.cast(x, tf.int32)
    y = tf.cast(y, tf.int32)
    seq_len = tf.shape(x)[0]
    return {"tokens": x, "length": seq_len}, y

ds = ds.shuffle(buffer_size=NUM_SAMPLES)
ds = ds.map(_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.padded_batch(batch_size=8, padded_shapes=({"tokens": [None], "length": []}, []))
ds = ds.prefetch(tf.data.AUTOTUNE)

token_input = tf.keras.Input(shape=(None,), dtype=tf.int32, name="tokens")
length_input = tf.keras.Input(shape=(), dtype=tf.int32, name="length")

emb = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True)(token_input)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(emb)
length_float = tf.keras.layers.Lambda(lambda z: tf.expand_dims(tf.cast(z, tf.float32), -1))(length_input)
x = tf.keras.layers.Concatenate()([x, tf.keras.layers.Dense(16)(length_float)])
out = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=[token_input, length_input], outputs=out)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(ds, epochs=3)

Collectives™ on Stack Overflow

ValueError: Can't convert non-rectangular Python sequence to Tensor in text-classification problem

1 Answer 1

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related