I am building a text classification system which requires a large preprocessing and training script. The script reads variable-length token sequences and attempts to build a tf.data.Dataset using from_tensor_slices. It then performs several dataset transformations, builds a simple Keras model, and calls model.fit().
However, when I run it, I immediately get the following error:
ValueError: Can't convert non-rectangular Python sequence to Tensor
For reference, I used ideas from this example notebook.
def make_random_sequence(min_len=1, max_len=20):
length = random.randint(min_len, max_len)
# tokens in [1, VOCAB_SIZE), 0 reserved for padding
return [random.randint(1, VOCAB_SIZE - 1) for _ in range(length)]
sequences = [make_random_sequence(min_len=1, max_len=30) for _ in range(NUM_SAMPLES)]
labels = [random.randint(0, 1) for _ in range(NUM_SAMPLES)]
ds = tf.data.Dataset.from_tensor_slices((sequences, labels))
def _preprocess(x, y):
x = tf.cast(x, tf.int32)
y = tf.cast(y, tf.int32)
seq_len = tf.shape(x)[0]
return {"tokens": x, "length": seq_len}, y
ds = ds.shuffle(buffer_size=NUM_SAMPLES)
ds = ds.map(_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.padded_batch(batch_size=8, padded_shapes=({"tokens": [None], "length": []}, []))
ds = ds.prefetch(tf.data.AUTOTUNE)
token_input = tf.keras.Input(shape=(None,), dtype=tf.int32, name="tokens")
length_input = tf.keras.Input(shape=(), dtype=tf.int32, name="length")
emb = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True)(token_input)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(emb)
x = tf.keras.layers.Concatenate()([x, tf.keras.layers.Dense(16)(tf.cast(length_input, tf.float32))])
out = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs=[token_input, length_input], outputs=out)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])