-
Notifications
You must be signed in to change notification settings - Fork 0
/
Vector.java
62 lines (46 loc) · 1.74 KB
/
Vector.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
package similartweets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
/**
 * Sparse term-weight vector for a single tweet.
 *
 * <p>For every term of the supplied vocabulary the weight is
 * {@code log10(1 + tf) * idf}, where {@code tf} is the number of tweet tokens equal to
 * the term and {@code idf} is the entry of the idf array at the term's index. Only terms
 * whose weight compares unequal to zero are stored.
 */
public class Vector {

    /**
     * Delimiter regex used to tokenize a tweet: a space or one of
     * {@code . , ? ! : ) ( ; ] [ / - +}.
     *
     * <p>NOTE(review): the {@code \...} alternative (a literal dot followed by any two
     * characters) can never be selected, because the plain-dot alternative listed before
     * it already matches at the same position. It was presumably meant to cover an
     * ellipsis; it is kept unchanged because it has no effect on the result.
     */
    final String SPLIT = " |\\.|\\,|\\?|\\!|\\:|\\...|\\)|\\(|\\;|\\]|\\[|\\/|\\-|\\+";

    /** The raw tweet text as passed to the constructor. */
    private final String tweet;

    /** Tokens of the tweet, produced by splitting on {@link #SPLIT}. */
    private final String[] tweetWords;

    /** Vocabulary terms; the term at index {@code i} pairs with {@code idfArray[i]}. */
    private final ArrayList<String> vectorTerms;

    /** Inverse document frequency per vocabulary term, indexed like {@code vectorTerms}. */
    private final double[] idfArray;

    /** Non-zero weights keyed by term. */
    private final HashMap<String, Double> wordWeight;

    /**
     * Tokenizes the tweet and computes the weight of every vocabulary term.
     *
     * @param tweet    tweet text to vectorize
     * @param terms    vocabulary terms to compute weights for
     * @param idfArray idf value for each term, read at the term's index in {@code terms}
     * @throws NullPointerException           if {@code tweet}, {@code terms} or
     *                                        {@code idfArray} is {@code null}
     * @throws ArrayIndexOutOfBoundsException if {@code idfArray} has fewer entries than
     *                                        {@code terms}
     */
    public Vector(String tweet, ArrayList<String> terms, double[] idfArray) {
        this.tweet = tweet;
        this.tweetWords = getWordsFromTweet(tweet);
        this.vectorTerms = terms;
        this.idfArray = idfArray;
        this.wordWeight = fillWeightList();
    }

    /**
     * Returns the computed term weights.
     *
     * @return the map from term to weight held by this instance (not a copy); terms
     *         whose weight equals zero are absent
     */
    public HashMap<String, Double> getWeight() {
        return this.wordWeight;
    }

    /**
     * Builds the term-to-weight map for this tweet.
     *
     * @return map containing every vocabulary term whose weight is not equal to zero
     */
    private HashMap<String, Double> fillWeightList() {
        // Count every token once so each vocabulary term needs only a map lookup
        // instead of a full rescan of the token array.
        HashMap<String, Integer> tokenCounts = new HashMap<>();
        for (String word : tweetWords) {
            Integer seen = tokenCounts.get(word);
            tokenCounts.put(word, seen == null ? 1 : seen + 1);
        }

        HashMap<String, Double> weights = new HashMap<>();
        for (int i = 0; i < vectorTerms.size(); i++) {
            String term = vectorTerms.get(i);
            Integer count = tokenCounts.get(term);
            int tf = (count == null) ? 0 : count;
            double weight = Math.log10(1 + tf) * idfArray[i];
            // A term that is absent from the tweet (tf == 0) or has idf == 0 yields a
            // zero weight and is left out to keep the vector sparse.
            if (weight != 0.0) {
                weights.put(term, weight);
            }
        }
        return weights;
    }

    /**
     * Splits a tweet into tokens on the {@link #SPLIT} delimiters.
     *
     * <p>Adjacent delimiters produce empty-string tokens, which are kept in the result.
     *
     * @param tweet text to tokenize
     * @return the tokens in order of appearance
     */
    private String[] getWordsFromTweet(String tweet) {
        return tweet.split(SPLIT);
    }
}