1 file changed: +2 -11 lines changed
Original file line number | Diff line number | Diff line change
34
34
35
35
# load the reviews
36
36
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
37
- positive_reviews = BeautifulSoup (open ('electronics/positive.review' ).read ())
37
+ positive_reviews = BeautifulSoup (open ('electronics/positive.review' ).read (), features = "html5lib" )
38
38
positive_reviews = positive_reviews .findAll ('review_text' )
39
39
40
- negative_reviews = BeautifulSoup (open ('electronics/negative.review' ).read ())
40
+ negative_reviews = BeautifulSoup (open ('electronics/negative.review' ).read (), features = "html5lib" )
41
41
negative_reviews = negative_reviews .findAll ('review_text' )
42
42
43
- # there are more positive reviews than negative reviews
44
- # so let's take a random sample so we have balanced classes
45
- # np.random.shuffle(positive_reviews)
46
- # positive_reviews = positive_reviews[:len(negative_reviews)]
47
43
48
- # we can also oversample the negative reviews
49
- diff = len (positive_reviews ) - len (negative_reviews )
50
- idxs = np .random .choice (len (negative_reviews ), size = diff )
51
- extra = [negative_reviews [i ] for i in idxs ]
52
- negative_reviews += extra
53
44
54
45
# first let's just try to tokenize the text using nltk's tokenizer
55
46
# let's take the first review for example:
You can’t perform that action at this time.
0 commit comments