forked from khuyentran1401/Data-science
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e6276f9
commit 9cb267c
Showing
10 changed files
with
242 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[flake8] | ||
ignore = E203, E266, E501, W503, F403, F401, E402 | ||
max-line-length = 89 | ||
max-complexity = 18 | ||
select = B,C,E,F,W,T4,B9 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,9 @@ dp* | |
mallet-2.0.8 | ||
.benchmarks | ||
wandb | ||
*.pkl | ||
*.zip | ||
|
||
|
||
# VSCode workspace | ||
*-workspace | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[settings] | ||
line_length = 88 | ||
multi_line_output = 3 | ||
include_trailing_comma = True | ||
known_third_party = celery,django,environ,pyquery,pytz,redis,requests,rest_framework |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# See https://pre-commit.com for more information | ||
# See https://pre-commit.com/hooks.html for more hooks | ||
repos: | ||
- repo: https://github.com/ambv/black | ||
rev: stable | ||
hooks: | ||
- id: black | ||
language_version: python3.7 | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v2.0.0 | ||
hooks: | ||
- id: flake8 | ||
- repo: https://github.com/timothycrosley/isort | ||
rev: 4.3.21 | ||
hooks: | ||
- id: isort |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
[tool.black] | ||
py36 = true | ||
include = '\.pyi?$' | ||
exclude = ''' | ||
/( | ||
\.git | ||
| \.hg | ||
| \.mypy_cache | ||
| \.tox | ||
| \.venv | ||
| _build | ||
| buck-out | ||
| build | ||
| dist | ||
# The following are specific to Black, you probably don't want those. | ||
| blib2to3 | ||
| tests/data | ||
)/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Data | ||
Twitter.zip |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import gdown | ||
import zipfile | ||
|
||
from os import listdir | ||
from os.path import isfile, join | ||
import xml.etree.ElementTree as ET | ||
|
||
def main():
    """Download the Twitter archive and read the English train/test documents."""
    # Collected in one mapping so the run configuration is visible at a glance.
    settings = {
        "url": 'https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3',
        "output": 'Twitter.zip',
        "path_train": 'Data/train/en',
        "path_test": 'Data/test/en',
    }
    load_data(**settings)
|
||
|
||
def load_data(url: str, output: str, path_train: str, path_test: str):
    """Download a zipped corpus, unpack it and read the train/test documents.

    Args:
        url: Address handed to ``gdown.download``.
        output: Path the archive is downloaded to and subsequently unzipped from.
        path_train: Directory containing the training XML files.
        path_test: Directory containing the test XML files.

    Returns:
        A tuple ``(t_train, t_test)``. Each element is a list holding one
        string per XML file: the texts of that file's entries joined by
        single spaces.
    """
    # Use the caller-supplied ``output``; it used to be silently replaced by
    # a hard-coded 'Twitter.zip', which made the parameter meaningless.
    gdown.download(url, output, quiet=False)

    # The archive is extracted into the current working directory.
    with zipfile.ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall('.')

    return _read_docs(path_train), _read_docs(path_test)


def _read_docs(path):
    """Return one space-joined string per XML file found directly in ``path``."""
    # ``listdir`` yields names in arbitrary order; sorting keeps the document
    # order reproducible between runs and machines. "truth.txt" is excluded
    # by name, exactly as before.
    file_names = sorted(
        name
        for name in listdir(path)
        if isfile(join(path, name)) and name != "truth.txt"
    )

    docs = []
    for name in file_names:
        # The entries are the children of the root element's first child.
        entries = ET.parse(join(path, name)).getroot()[0]
        # An element without text has ``.text is None``; skip those so the
        # join cannot raise TypeError.
        docs.append(' '.join(entry.text for entry in entries if entry.text))
    return docs
|
||
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import gdown | ||
import zipfile | ||
|
||
from os import listdir | ||
from os.path import isfile, join | ||
import xml.etree.ElementTree as ET | ||
|
||
from typing import Tuple, List | ||
|
||
def main():
    """Build a DataGetter for the Twitter archive and read both document splits."""
    getter = DataGetter(
        url='https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3',
        output_path='Twitter.zip',
        path_train='Data/train/en',
        path_test='Data/test/en',
    )

    # The documents are loaded but not used further in this script.
    tweet_train, tweet_test = getter.get_train_test_docs()
|
||
class DataGetter:
    """Download a zipped corpus, unpack it and read its XML documents.

    Creating an instance immediately downloads ``url`` to ``output_path``
    via gdown and extracts that archive into the current working directory.
    """

    def __init__(self, url: str, output_path: str, path_train: str, path_test: str):
        """Store the locations, then download and unzip the archive.

        Args:
            url: Address handed to ``gdown.download``.
            output_path: Path the archive is written to and unzipped from.
            path_train: Directory containing the training XML files.
            path_test: Directory containing the test XML files.
        """
        self.url = url
        self.output_path = output_path
        self.path_train = path_train
        self.path_test = path_test
        self.download_zip_data_from_google_drive()
        self.unzip_data()

    def download_zip_data_from_google_drive(self) -> None:
        """Download ``self.url`` to ``self.output_path`` using gdown."""
        gdown.download(self.url, self.output_path, quiet=False)

    def unzip_data(self) -> None:
        """Extract the downloaded archive into the current working directory."""
        with zipfile.ZipFile(self.output_path, 'r') as zip_ref:
            zip_ref.extractall('.')

    def get_train_test_docs(self) -> Tuple[List[str], List[str]]:
        """Return ``(train_docs, test_docs)``, one string per XML file."""
        tweets_train_files = self.get_files(self.path_train)
        tweets_test_files = self.get_files(self.path_test)

        t_train = self.extract_texts_from_multiple_files(self.path_train, tweets_train_files)
        t_test = self.extract_texts_from_multiple_files(self.path_test, tweets_test_files)
        return t_train, t_test

    @staticmethod
    def get_files(path: str) -> List[str]:
        """List the regular files directly inside ``path``, excluding "truth.txt".

        The names are sorted because ``listdir`` returns them in arbitrary
        order, which would make the document order differ between runs.
        """
        return sorted(
            name
            for name in listdir(path)
            if isfile(join(path, name)) and name != "truth.txt"
        )

    @classmethod
    def extract_texts_from_multiple_files(cls, path_to_file: str, files: List[str]) -> List[str]:
        """Return the joined text of every file in ``files``, in the given order.

        Args:
            path_to_file: Directory that contains the files.
            files: File names relative to ``path_to_file``.
        """
        return [cls.extract_texts_from_each_file(path_to_file, name) for name in files]

    @staticmethod
    def extract_texts_from_each_file(path_to_file: str, file_name: str) -> str:
        """Join the texts of one XML file's entries with single spaces.

        The entries are the children of the root element's first child.
        Entries without text (``.text is None``) are skipped; joining them
        used to raise ``TypeError``.

        Args:
            path_to_file: Directory that contains the file.
            file_name: Name of the XML file inside ``path_to_file``.
        """
        entries = ET.parse(join(path_to_file, file_name)).getroot()[0]
        return ' '.join(entry.text for entry in entries if entry.text)
|
||
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()
Binary file not shown.
Oops, something went wrong.