1

I am building a text classification system which requires a large preprocessing and training script. The script reads variable-length token sequences and attempts to build a tf.data.Dataset using from_tensor_slices. It then performs several dataset transformations, builds a simple Keras model, and calls model.fit().

However, when I run it, I immediately get the following error:

ValueError: Can't convert non-rectangular Python sequence to Tensor

For reference, I used ideas from this example notebook.

def make_random_sequence(min_len=1, max_len=20):
    """Draw a random sequence length, then fill it with random token ids.

    Ids are uniform over [1, VOCAB_SIZE); id 0 is reserved for padding.
    """
    n_tokens = random.randint(min_len, max_len)
    tokens = []
    for _ in range(n_tokens):
        tokens.append(random.randint(1, VOCAB_SIZE - 1))
    return tokens

# Build NUM_SAMPLES variable-length token lists (lengths 1..30) and binary labels.
sequences = [make_random_sequence(min_len=1, max_len=30) for _ in range(NUM_SAMPLES)]
labels = [random.randint(0, 1) for _ in range(NUM_SAMPLES)]
 
# NOTE(review): this is the line that raises the reported ValueError —
# from_tensor_slices needs rectangular inputs, but `sequences` has rows of
# differing lengths, so the list cannot be converted to a single Tensor.
ds = tf.data.Dataset.from_tensor_slices((sequences, labels)) 

def _preprocess(x, y):
    """Cast a (sequence, label) pair to int32 and attach the sequence length.

    Returns ({"tokens": ..., "length": ...}, label) so the model can see both
    the token ids and the true (pre-padding) length of each example.
    """
    tokens = tf.cast(x, tf.int32)
    label = tf.cast(y, tf.int32)
    return {"tokens": tokens, "length": tf.shape(tokens)[0]}, label

# Shuffle over the whole dataset, then map, pad into batches, and prefetch.
ds = ds.shuffle(buffer_size=NUM_SAMPLES)
ds = ds.map(_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
# "tokens" is padded to the longest sequence in each batch of 8;
# "length" and the label are scalars and need no padding.
ds = ds.padded_batch(batch_size=8, padded_shapes=({"tokens": [None], "length": []}, []))
ds = ds.prefetch(tf.data.AUTOTUNE)

# Two inputs whose names match the feature-dict keys emitted by _preprocess.
token_input = tf.keras.Input(shape=(None,), dtype=tf.int32, name="tokens")
length_input = tf.keras.Input(shape=(), dtype=tf.int32, name="length")

# mask_zero=True makes the Embedding emit a mask so the LSTM skips padding (id 0).
emb = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True)(token_input)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(emb)
# NOTE(review): Dense is applied to the casted length, which has shape (batch,);
# Dense expects a trailing feature axis, so this likely needs an expand_dims to
# (batch, 1) — verify. Also, calling tf.cast directly on a Keras symbolic tensor
# (rather than inside a Lambda layer) may not be supported — confirm on your TF version.
x = tf.keras.layers.Concatenate()([x, tf.keras.layers.Dense(16)(tf.cast(length_input, tf.float32))])
out = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=[token_input, length_input], outputs=out)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

1 Answer 1

0

The ValueError was raised because you passed a Python list containing sequences of differing lengths, whereas tf.data.Dataset.from_tensor_slices requires a rectangular (uniformly shaped) input. You can fix this by first wrapping the sequences in a tf.RaggedTensor, which is designed to represent non-rectangular data: slicing it yields one variable-length row per example, keeping the data compatible with the rest of the tf.data pipeline. You can refer to a similar issue here.

Code:

def make_random_sequence(min_len=1, max_len=20):
    """Build a list of random token ids whose length is drawn from [min_len, max_len].

    Ids fall in [1, VOCAB_SIZE); id 0 is kept free for padding.
    """
    remaining = random.randint(min_len, max_len)
    out = []
    while remaining > 0:
        out.append(random.randint(1, VOCAB_SIZE - 1))
        remaining -= 1
    return out

# Same synthetic data as before: variable-length token lists plus binary labels.
sequences = [make_random_sequence(min_len=1, max_len=30) for _ in range(NUM_SAMPLES)]
labels = [random.randint(0, 1) for _ in range(NUM_SAMPLES)]

# The fix: tf.ragged.constant accepts non-rectangular Python lists, and
# from_tensor_slices on a RaggedTensor yields one variable-length row per
# example instead of failing the dense-Tensor conversion.
ragged_sequences = tf.ragged.constant(sequences)
ds = tf.data.Dataset.from_tensor_slices((ragged_sequences, labels))

def _preprocess(x, y):
    """Cast tokens and label to int32 and record the unpadded sequence length."""
    x, y = tf.cast(x, tf.int32), tf.cast(y, tf.int32)
    features = {"tokens": x, "length": tf.shape(x)[0]}
    return features, y

# Shuffle the full dataset, map, batch with padding, and prefetch.
ds = ds.shuffle(buffer_size=NUM_SAMPLES)
ds = ds.map(_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
# Pad "tokens" to the longest sequence within each batch of 8;
# "length" and the label are scalars, so no padding is applied to them.
ds = ds.padded_batch(batch_size=8, padded_shapes=({"tokens": [None], "length": []}, []))
ds = ds.prefetch(tf.data.AUTOTUNE)

# Inputs are named to match the feature-dict keys produced by the pipeline.
token_input = tf.keras.Input(shape=(None,), dtype=tf.int32, name="tokens")
length_input = tf.keras.Input(shape=(), dtype=tf.int32, name="length")

# Embed with mask_zero=True so padding (id 0) is masked through the BiLSTM.
embedding_layer = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True)
sequence_encoding = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(embedding_layer(token_input))

# Cast the scalar length to float and add a trailing feature axis for Dense.
to_float_column = tf.keras.layers.Lambda(lambda t: tf.expand_dims(tf.cast(t, tf.float32), -1))
length_features = tf.keras.layers.Dense(16)(to_float_column(length_input))

combined = tf.keras.layers.Concatenate()([sequence_encoding, length_features])
out = tf.keras.layers.Dense(1, activation="sigmoid")(combined)

model = tf.keras.Model(inputs=[token_input, length_input], outputs=out)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(ds, epochs=3)
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.