File tree Expand file tree Collapse file tree 3 files changed +28
-7
lines changed Expand file tree Collapse file tree 3 files changed +28
-7
lines changed Original file line number Diff line number Diff line change 1
- venv
1
+ venv *
Original file line number Diff line number Diff line change 1
1
import sys
2
2
import resource
3
3
import argparse
4
+ import pathlib
5
+ from warcio import ArchiveIterator
4
6
5
7
MEGABYTE = 1024 * 1024
6
8
def memory_limit(value):
    """Cap the address space of the current process.

    ``value`` is expressed in megabytes. The resulting byte count is
    installed as both the soft and the hard ``RLIMIT_AS`` limit.
    """
    byte_cap = value * MEGABYTE
    soft_and_hard = (byte_cap, byte_cap)
    resource.setrlimit(resource.RLIMIT_AS, soft_and_hard)
9
11
10
- def main ():
12
def main(my_args):
    """Read every response record of every WARC file in the corpus directory.

    Args:
        my_args: parsed command-line arguments; only ``corpus_path`` is
            read here.

    Raises:
        FileNotFoundError: if ``my_args.corpus_path`` does not exist.
    """
    corpus_dir = pathlib.Path(my_args.corpus_path)

    # Raise explicitly instead of asserting: assert statements are removed
    # when Python runs with -O, which would let a bad path slip through.
    if not corpus_dir.exists():
        raise FileNotFoundError(f"corpus path does not exist: {corpus_dir}")

    # Read each WARC file of the corpus.
    for warc_path in corpus_dir.glob('*.warc.gz.kaggle'):
        with open(warc_path, 'rb') as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type != 'response':
                    continue

                # Crawled pages are not guaranteed to be valid UTF-8, so
                # undecodable bytes are replaced rather than aborting the run.
                text = record.raw_stream.read().decode(errors='replace')

                # TODO: for each document, compute the word frequencies and
                # add them to the index, taking memory usage into account.
32
+
33
+ def configArgs (parser ):
18
34
parser .add_argument (
19
35
'-m' ,
20
36
dest = 'memory_limit' ,
@@ -39,12 +55,17 @@ def main():
39
55
required = True ,
40
56
help = 'path of the index file to be generated'
41
57
)
58
+ return parser
59
+
60
if __name__ == "__main__":
    # Script entry point: parse the CLI arguments, apply the memory cap,
    # then run main() and report memory exhaustion with exit status 1.
    parser = argparse.ArgumentParser(
        description='Build an index file from a corpus of WARC files.'
    )
    parser = configArgs(parser)

    my_args = parser.parse_args()

    # The limit is applied before main() so that all of its work runs under it.
    memory_limit(my_args.memory_limit)
    try:
        main(my_args)
    except MemoryError:
        sys.stderr.write('\n \n ERROR: Memory Exception\n ')
        sys.exit(1)
Original file line number Diff line number Diff line change @@ -10,4 +10,4 @@ soupsieve==2.3.2.post1
10
10
tqdm == 4.64.0
11
11
typing-extensions == 4.2.0
12
12
warcio == 1.7.4
13
- zipp == 3.8.0
13
+ zipp == 3.8.0
You can’t perform that action at this time.
0 commit comments