I am using a Doc2Vec model to calculate cosine similarity between observations in a dataset of website text. I want to be sure that my measure is roughly consistent if I instead use Fasttext (trained on my data) or Longformer (pre-trained) [I know they won't be identical]. However, the pairwise cosine similarity measure is strongly negatively correlated between Doc2Vec & Longformer or Doc2Vec & Fasttext. The measure is positively correlated between Longformer & Fasttext. Is there a reason why one might expect this? Am I mistakenly doing something in my code that could be causing this?
# PREPARE DATA
website_df = pd.read_csv(data_path+'cleaned_docdf_may2023.csv')
website_df[['documents_cleaned','website']]=website_df[['documents_cleaned','website']].astype(str)
website_df['documents_cleaned']=website_df['documents_cleaned'].str.lower()
website_df['documents_cleaned']=website_df['documents_cleaned'].str.strip()
#######################
# Train Doc2vec model
#######################
# Clean data for model input (trim long docs, lower case, tokenize):
counter = 0
all_docs = []
all_docs_simple = []
for train_doc in website_df.documents_cleaned:
doc = train_doc[:150000] if len(train_doc) > 150000 else train_doc
# clean using simple_preprocess for Fasttext model input
simple_pre = gensim.utils.simple_preprocess(train_doc)
doc = remove_stopwords(doc)
doc_tokens =nltk.word_tokenize(doc.lower())
all_docs.append(doc_tokens)
all_docs_simple.append(simple_pre)
if (counter%100) == 0:
print("{0} .. len: {1}".format(counter,len(doc)))
counter += 1
# Creating all tagged documents
documents_websites = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_docs)]
documents_simplepre_websites = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_docs_simple)]
print("\t. Run model")
doc2vec_model_websites = Doc2Vec(documents = documents_websites,
vector_size=700,
window=7,
min_count =3)
print("\t. Done")
doc2vec_model_websites.save(data_path + "doc2vec_websites.model")
# Grab document level vectors
vectors_d2v_websites = doc2vec_model_websites.dv.get_normed_vectors()
#########################
# FASTTEXT MODEL
#########################
# create and save Fasttext input
sent_df_websites=pd.Series(documents_simplepre_websites)
with open(data_path + 'sentences_websites', 'a') as f:
df_string = sent_df_websites.to_string(header=False, index=False)
f.write(df_string)
# Skipgram model (use comparable model parameters to doc2vec model) :
ft_model_sg_websites = fasttext.train_unsupervised(input=sent_df_websites, model='skipgram', ws=7, epoch=10, minCount=3)
ft_model_sg_websites.save_model(data_path + "ft_websites_sg.bin")
# cbow model (use comparable model parameters to doc2vec model) :
ft_model_cbow_websites = fasttext.train_unsupervised(input=data_path + 'sentences_websites', model='cbow', ws=7, epoch=10, minCount=3)
ft_model_cbow_websites.save_model(data_path + "ft_websites_cbow.bin")
def generateVector(sentence):
return ft.get_sentence_vector(sentence)
ft = ft_model_sg_websites
website_df['embeddings_sg'] = website_df['documents_cleaned'].apply(generateVector)
embeddings_sg_website=website_df['embeddings_sg']
ft = ft_model_cbow_websites
website_df['embeddings_cbow'] = website_df['documents_cleaned'].apply(generateVector)
embeddings_cbow_website=website_df['embeddings_sg']
#########################
# LONGFORMER
#########################
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerModel.from_pretrained(model_name)
def get_longformer_embeddings(text):
encoded_input = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)
output = model(**encoded_input, output_hidden_states=True)
embeddings = output.last_hidden_state
avg_emb = embeddings.mean(dim=1)
return avg_emb.cpu().detach().numpy()
def get_cosine_sim(a,b):
value = dot(a, b)/(norm(a)*norm(b))
return value
# subset data for speed during pilot
website_subset = website_df[:100]
website_subset['embeddings'] = website_subset['documents_cleaned'].apply(get_longformer_embeddings)
#########################
# EVALUATE CONSISTENCY BETWEEN MODELS
#########################
# create dataframe of random pairwise combinations
rand = np.random.randint(1,100,size=(500,2))
df = pd.DataFrame(rand, columns=['rand1', 'rand2'])
df['sim_lf']=0
df['sim_dv']=0
df['sim_ft_sg']=0
df['sim_ft_cbow']=0
for ind in df.index:
a_loc = df['rand1'][ind]
b_loc = df['rand2'][ind]
a_vec_dv = vectors_d2v_websites[a_loc]
b_vec_dv = vectors_d2v_websites[b_loc]
a_vec_ft_sg = embeddings_sg_website[a_loc]
b_vec_ft_sg = embeddings_sg_website[b_loc]
a_vec_ft_cbow = embeddings_cbow_website[a_loc]
b_vec_ft_cbow = embeddings_cbow_website[b_loc]
a_vec_lf = website_subset['embeddings'][a_loc]
b_vec_lf = website_subset['embeddings'][b_loc].T
cos_sim_lf = get_cosine_sim(a_vec_lf, b_vec_lf)
cos_sim_dv = get_cosine_sim(a_vec_dv, b_vec_dv)
cos_sim_ft_sg = get_cosine_sim(a_vec_ft_sg,b_vec_ft_sg)
cos_sim_ft_cbow = get_cosine_sim(a_vec_ft_cbow,b_vec_ft_cbow)
df['sim_lf'][ind]=cos_sim_lf
df['sim_dv'][ind]=cos_sim_dv
df['sim_ft_sg'][ind]=cos_sim_ft_sg
df['sim_ft_cbow'][ind]=cos_sim_ft_cbow
print('MY WEBSITE DATA SIMILARITY')
print('corr(Longformer, Fasttext (skipgram)) = ',df['sim_lf'].corr(df['sim_ft_sg']))
print('corr(Longformer, Fasttext (cbow)) = ',df['sim_lf'].corr(df['sim_ft_cbow']))
print('corr(Longformer, d2v) = ',df['sim_lf'].corr(df['sim_dv']))
print('corr(Fasttext (skipgram), d2v) = ',df['sim_ft_sg'].corr(df['sim_dv']))
print('corr(Fasttext (cbow), d2v) = ',df['sim_ft_cbow'].corr(df['sim_dv']))