Skip to content

Commit 4417d45

Browse files
committed
Leitura de arquivos
1 parent b106eae commit 4417d45

File tree

3 files changed

+28
-7
lines changed

3 files changed

+28
-7
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
venv
1+
venv*

indexer.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,36 @@
11
import sys
22
import resource
33
import argparse
4+
import pathlib
5+
from warcio import ArchiveIterator
46

57
MEGABYTE = 1024 * 1024
68
def memory_limit(value):
    """Cap the address space available to this process.

    ``value`` is multiplied by ``MEGABYTE`` to obtain a byte count, and that
    byte count is installed as both elements of the ``RLIMIT_AS`` limit pair
    passed to ``resource.setrlimit``.
    """
    byte_cap = value * MEGABYTE
    soft_and_hard = (byte_cap, byte_cap)
    resource.setrlimit(resource.RLIMIT_AS, soft_and_hard)
911

10-
def main():
12+
def main(my_args):
1113
"""
1214
Your main calls should be added here
1315
"""
14-
pass
16+
corpus_dir = pathlib.Path(my_args.corpus_path)
1517

16-
if __name__ == "__main__":
17-
parser = argparse.ArgumentParser(description='Process some integers.')
18+
assert corpus_dir.exists()
19+
20+
#Ler um arquivo warc
21+
22+
for file in corpus_dir.glob('*.warc.gz.kaggle'):
23+
24+
with open(file, 'rb') as stream:
25+
26+
for record in ArchiveIterator(stream):
27+
if record.rec_type == 'response':
28+
texto = record.raw_stream.read().decode()
29+
30+
#Para cada documento, calcular a frequência de palavras e adicionar no index
31+
#Tudo isso levando em consideração a memória utilizada
32+
33+
def configArgs(parser):
1834
parser.add_argument(
1935
'-m',
2036
dest='memory_limit',
@@ -39,12 +55,17 @@ def main():
3955
required=True,
4056
help='path of the index file to be generated'
4157
)
58+
return parser
59+
60+
if __name__ == "__main__":
61+
parser = argparse.ArgumentParser(description='Process some integers.')
62+
parser = configArgs(parser)
4263

4364
my_args = parser.parse_args()
4465

4566
memory_limit(my_args.memory_limit)
4667
try:
47-
main()
68+
main(my_args)
4869
except MemoryError:
4970
sys.stderr.write('\n\nERROR: Memory Exception\n')
5071
sys.exit(1)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ soupsieve==2.3.2.post1
1010
tqdm==4.64.0
1111
typing-extensions==4.2.0
1212
warcio==1.7.4
13-
zipp==3.8.0
13+
zipp==3.8.0

0 commit comments

Comments
 (0)