-
Notifications
You must be signed in to change notification settings - Fork 10
/
6. review.py
28 lines (22 loc) · 822 Bytes
/
6. review.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import codecademylib3_seaborn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from raven import the_raven_stanzas
from preprocessing import preprocess_text
# Show the first raw stanza so the untouched input text is visible.
print(the_raven_stanzas[0])

# Run every stanza through the project's preprocessing helper before
# vectorizing; each stanza is treated as one document.
processed_stanzas = [preprocess_text(stanza) for stanza in the_raven_stanzas]

# Fit the vectorizer and compute tf-idf scores in one pass.
# norm=None turns off the per-document normalization TfidfVectorizer applies
# by default, leaving the raw tf-idf products.
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_stanzas)

# Vocabulary terms, in the same order as the columns of tfidf_scores.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Human-readable, 1-based labels: "Stanza 1", "Stanza 2", ...
stanza_index = [
    f"Stanza {number}" for number in range(1, len(the_raven_stanzas) + 1)
]
# Build a terms-by-stanzas table of tf-idf scores: the score matrix is
# transposed so each row is a vocabulary term and each column is a stanza.
# toarray() yields a plain ndarray (todense() returns the discouraged
# numpy.matrix type); the resulting DataFrame values are the same.
try:
    df_tf_idf = pd.DataFrame(
        tfidf_scores.T.toarray(),
        index=feature_names,
        columns=stanza_index,
    )
except Exception as err:
    # Stay best-effort (the script should not crash here), but report the
    # failure instead of silently discarding it as a bare `except: pass` did.
    print(f"Could not build the tf-idf DataFrame: {err!r}")
else:
    print(df_tf_idf)