Dataset structureCan somebody help me with the NER model in converting the data into spacy format. The dataset format is shown in the screenshot here (https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus)
Though i build but the model is not giving any output during test.
#Convert data to spaCy format
def convert_to_spacy_format(data):
nlp = spacy.blank("en") # Creating blank English language model
db = DocBin() # document bin object
for _, row in tqdm(data.iterrows(), total=len(data)):
sentence = row["CleanSentence"]
pos_tags = row["POS"]
ner_tags = row["Tag"]
# Create a doc object
doc = nlp.make_doc(sentence)
# Split the sentence into words (tokens)
words = sentence.split()
# Check if lengths match
if len(words) != len(ner_tags) or len(words) != len(pos_tags):
print(f"Warning: Length mismatch: Words: {len(words)}, NER tags: {len(ner_tags)}, POS tags: {len(pos_tags)}")
continue
ents = []
current_ent = None
current_ent_start = None
# Process each token
for idx, (token, tag) in enumerate(zip(doc, ner_tags)):
# If it's the beginning of an entity
if tag.startswith("B-"):
# If we were tracking an entity, add it to our list
if current_ent is not None:
ents.append((current_ent_start, token.idx + len(token), current_ent))
# Start tracking a new entity
current_ent = tag[2:] # Remove "B-" prefix
current_ent_start = token.idx
# If it's inside an entity
elif tag.startswith("I-"):
# Continue tracking the current entity
pass
# If it's outside any entity
elif tag == "O":
# If we were tracking an entity, add it to the list
if current_ent is not None:
ents.append((current_ent_start, token.idx, current_ent))
current_ent = None
current_ent_start = None
# Add the last entity if we were tracking one
if current_ent is not None:
ents.append((current_ent_start, len(sentence), current_ent))
# Create spans for each entity
spans = []
for start, end, label in ents:
span = doc.char_span(start, end, label=label)
if span is not None:
spans.append(span)
# Filter overlapping spans
filtered_spans = filter_spans(spans)
# Add entities to the doc
doc.ents = filtered_spans
# Add the doc to the DocBin
db.add(doc)
return db
I tried to build an NER model but didn't got the expected output. I need help in the function
convert_to_spacy_format(data)