Skip to content

Commit

Permalink
good functions example
Browse files Browse the repository at this point in the history
  • Loading branch information
khuyentran1401 committed Jan 20, 2021
1 parent e6276f9 commit 9cb267c
Show file tree
Hide file tree
Showing 10 changed files with 242 additions and 35 deletions.
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401, E402
max-line-length = 89
max-complexity = 18
select = B,C,E,F,W,T4,B9
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ dp*
mallet-2.0.8
.benchmarks
wandb
*.pkl
*.zip


# VSCode workspace
*-workspace
Expand Down
5 changes: 5 additions & 0 deletions .isort.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[settings]
line_length = 88
multi_line_output = 3
include_trailing_comma = True
known_third_party = celery,django,environ,pyquery,pytz,redis,requests,rest_framework
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/ambv/black
rev: stable
hooks:
- id: black
language_version: python3.7
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.0.0
hooks:
- id: flake8
- repo: https://github.com/timothycrosley/isort
rev: 4.3.21
hooks:
- id: isort
19 changes: 19 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[tool.black]
py36 = true
include = '\.pyi?$'
exclude = '''
/(
\.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
# The following are specific to Black, you probably don't want those.
| blib2to3
| tests/data
)/
'''
2 changes: 2 additions & 0 deletions python/good_functions/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Data
Twitter.zip
46 changes: 46 additions & 0 deletions python/good_functions/bad_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import gdown
import zipfile

from os import listdir
from os.path import isfile, join
import xml.etree.ElementTree as ET

def main():
    """Call ``load_data`` with this example's URL, archive name and data directories."""
    settings = {
        "url": "https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3",
        "output": "Twitter.zip",
        "path_train": "Data/train/en",
        "path_test": "Data/test/en",
    }
    # Keyword expansion keeps every argument bound by name.
    load_data(**settings)


def load_data(url: str, output: str, path_train: str, path_test: str):
    """Download a zipped dataset, unzip it, and return the train and test documents.

    Args:
        url: Address handed to ``gdown.download``.
        output: Local path the archive is saved to and then unzipped from.
        path_train: Directory holding the training XML files.
        path_test: Directory holding the test XML files.

    Returns:
        A ``(train_docs, test_docs)`` tuple. Each element is a list with one
        string per file, built by joining the texts of the file's entries
        with single spaces.
    """
    # Use the caller-supplied ``output`` path rather than a hard-coded archive name.
    gdown.download(url, output, quiet=False)

    # The archive is extracted into the current working directory.
    with zipfile.ZipFile(output, "r") as zip_ref:
        zip_ref.extractall(".")

    return _read_docs(path_train), _read_docs(path_test)


def _read_docs(path: str) -> list:
    """Return one space-joined text string per XML file in *path*, skipping ``truth.txt``."""
    docs = []
    for name in listdir(path):
        full_path = join(path, name)
        # Only regular files other than the "truth.txt" file are parsed.
        if not isfile(full_path) or name == "truth.txt":
            continue
        entries = ET.parse(full_path).getroot()[0]
        # Elements without text have ``.text`` set to None, which ``str.join``
        # rejects, so they are skipped.
        docs.append(" ".join(entry.text for entry in entries if entry.text))
    return docs

# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
73 changes: 73 additions & 0 deletions python/good_functions/good_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import gdown
import zipfile

from os import listdir
from os.path import isfile, join
import xml.etree.ElementTree as ET

from typing import Tuple, List

def main():
    """Build a ``DataGetter`` for this example's archive and read its train/test documents."""
    getter = DataGetter(
        url="https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3",
        output_path="Twitter.zip",
        path_train="Data/train/en",
        path_test="Data/test/en",
    )

    # The documents are fetched for demonstration; nothing in this function consumes them.
    train_docs, test_docs = getter.get_train_test_docs()

class DataGetter:
    """Download a zipped dataset and expose its train/test documents as strings.

    Constructing an instance downloads ``url`` to ``output_path`` and unzips
    the archive into the current working directory.
    """

    def __init__(self, url: str, output_path: str, path_train: str, path_test: str):
        """Store the locations, then download and unzip the archive.

        Args:
            url: Address handed to ``gdown.download``.
            output_path: Local path the archive is saved to.
            path_train: Directory holding the training XML files.
            path_test: Directory holding the test XML files.
        """
        self.url = url
        self.output_path = output_path
        self.path_train = path_train
        self.path_test = path_test
        self.download_zip_data_from_google_drive()
        self.unzip_data()

    def download_zip_data_from_google_drive(self):
        """Download ``self.url`` to ``self.output_path`` using gdown."""
        gdown.download(self.url, self.output_path, quiet=False)

    def unzip_data(self):
        """Extract the downloaded archive into the current working directory."""
        with zipfile.ZipFile(self.output_path, "r") as zip_ref:
            zip_ref.extractall(".")

    def get_train_test_docs(self) -> Tuple[List[str], List[str]]:
        """Return ``(train_docs, test_docs)``, one string per file in each directory."""
        tweets_train_files = self.get_files(self.path_train)
        tweets_test_files = self.get_files(self.path_test)

        t_train = self.extract_texts_from_multiple_files(self.path_train, tweets_train_files)
        t_test = self.extract_texts_from_multiple_files(self.path_test, tweets_test_files)
        return t_train, t_test

    @staticmethod
    def get_files(path: str) -> List[str]:
        """Return the names of the regular files in *path*, excluding ``truth.txt``."""
        return [
            name
            for name in listdir(path)
            if isfile(join(path, name)) and name != "truth.txt"
        ]

    @classmethod
    def extract_texts_from_multiple_files(cls, path_to_file: str, files: List[str]) -> List[str]:
        """Return the joined text of every file in *files*, in the given order.

        Args:
            path_to_file: Directory that contains the files.
            files: File names relative to *path_to_file*.
        """
        return [cls.extract_texts_from_each_file(path_to_file, name) for name in files]

    @staticmethod
    def extract_texts_from_each_file(path_to_file: str, file_name: str) -> str:
        """Return the texts under the XML root's first child, joined by single spaces.

        Args:
            path_to_file: Directory that contains the file.
            file_name: Name of one XML file inside *path_to_file*.
        """
        entries = ET.parse(join(path_to_file, file_name)).getroot()[0]
        # Elements without text have ``.text`` set to None, which ``str.join``
        # rejects, so they are skipped.
        return " ".join(entry.text for entry in entries if entry.text)

# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
Binary file removed visualization/github/test_df.zip
Binary file not shown.
Loading

0 comments on commit 9cb267c

Please sign in to comment.