import re
import os
from math import log

import nltk
from nltk import word_tokenize, sent_tokenize

nltk.download('punkt')

corpus = """
This is a sentence. This is another sentence. There are three sentences.
"""

def preprocessing(doc):
    """Flatten newlines, drop non-alphanumeric characters, collapse runs of
    spaces, strip, and lowercase the document."""
    doc = re.sub("\n", " ", doc)
    doc = re.sub("[^A-Za-z0-9]+", " ", doc)
    doc = re.sub(" +", " ", doc)
    doc = doc.strip()
    doc = doc.lower()
    return doc
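
# For example (illustrative):
#   preprocessing("This is a sentence.\n")  ->  "this is a sentence"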

# Split the original text into sentences and key the dict by them.
sentences = sent_tokenize(corpus)
dictionary = {}
for sentence in sentences:
    dictionary[sentence] = {}

for sentence in dictionary:
    # Keep the original sentence as the dict key; clean a copy for tokenizing
    # (reassigning the loop variable would break the dictionary lookup below).
    cleaned = preprocessing(sentence)
    terms = word_tokenize(cleaned)
    # TODO: lemmatize/stem terms (see the stemming sketch below)
    for term in terms:
        dictionary[sentence][term] = {}

print(dictionary)
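
# One way to fill the lemmatize/stem TODO above: NLTK's PorterStemmer. This is
# a sketch, not wired into the loop; WordNetLemmatizer would work similarly.
from nltk.stem import PorterStemmer

def stem_terms(terms):
    stemmer = PorterStemmer()
    return [stemmer.stem(term) for term in terms]

# e.g. stem_terms(["sentences", "running"]) -> ["sentenc", "run"]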

# Split original text into sentences and add to dict
# Clean sentences, remove stopwords, and split into a list
# Lemmatize/stem terms
# Add words to dict
# Calculate tfidf per word
# Calculate tfidf score per sentence
# Calculate threshold score (e.g. average score per sentence)
# Return all sentences above the threshold (see the scoring sketch below)
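
# A minimal sketch of the tfidf-scoring steps outlined above, treating each
# sentence as a "document". Note that dictionary stores each term at most once
# per sentence, so term frequency here is presence-based (an assumption of
# this sketch; the helper name score_sentences is illustrative).
def score_sentences(dictionary):
    n_sentences = len(dictionary)
    # Document frequency: in how many sentences does each term appear?
    doc_freq = {}
    for terms in dictionary.values():
        for term in terms:
            doc_freq[term] = doc_freq.get(term, 0) + 1
    # Sum tf * idf over the terms of each sentence.
    scores = {}
    for sentence, terms in dictionary.items():
        score = 0.0
        for term in terms:
            tf = 1 / max(len(terms), 1)
            idf = log(n_sentences / doc_freq[term])
            score += tf * idf
        scores[sentence] = score
    return scores

scores = score_sentences(dictionary)
threshold = sum(scores.values()) / len(scores)
summary = [s for s in sentences if scores[s] >= threshold]
print(summary)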

# Earlier multi-document draft, kept for reference:
# docs = []
# for __, __, files in os.walk(os.getcwd(), topdown=False):
#     for file in files:
#         if ".txt" in file:
#             f = open(file, "r")
#             content = f.read()
#             docs.append(content)

# def tokenize(doc):
#     sentences = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', doc)
#     return sentences

# tfidf = {}
# def tf(terms):
#     for term in terms:
#         if term not in tfidf:
#             tfidf[term] = {}
#             tfidf[term]["termFrequency"] = 1
#         else:
#             tfidf[term]["termFrequency"] += 1

# def df(terms, docs):
#     doc_list = []
#     for doc in docs:
#         doc = preprocessing(doc)
#         doc_list.append(doc)
#     for term in terms:
#         tfidf[term]["docFrequency"] = 0
#         for doc in doc_list:
#             if term in doc:
#                 tfidf[term]["docFrequency"] += 1

# def calc(tfidf):
#     term_list = []
#     for term in tfidf:
#         tf = tfidf[term]["termFrequency"]
#         doc_len = len(tfidf)  # counts distinct terms, not document length
#         df = tfidf[term]["docFrequency"]
#         docs_len = len(docs)
#         tfidf_calc = (tf / doc_len) * log(docs_len / df)
#         term_list.append([tfidf_calc, term])
#     results = sorted(term_list, reverse=True)[:10]
#     print(results)
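
# How the draft above would fit together (a hedged sketch, assuming terms are
# tokenized from the preprocessed docs, so every term appears in some doc):
# for doc in docs:
#     terms = word_tokenize(preprocessing(doc))
#     tf(terms)
#     df(terms, docs)
# calc(tfidf)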