# File: c4.yaml (forked from mlfoundations/dclm; 77 lines, 2.33 KB as published)
# Processing pipeline for the `cc_april_2019` source.
# `steps` is an ordered list; each entry names a `func`, and the remaining keys
# of that entry are presumably handed to the function as its parameters
# (confirm against the pipeline runner).
- source: cc_april_2019
  weight: 1.0
  steps:
    # Chunk 1: Various filtering and cleaning rules (local)
    # move_url_modifier is necessary because 'url' was not set up in these
    # jsonls; it was in page['metadata']['WARC-Target-URI'].
    - func: move_url_modifier
    - func: key_name_modifier
    - func: page_length_filter
      length_type: char
      max_length: 190000
    - func: word_length_modifier
      max_length: 1000
      model: split
    - func: citation_removal_modifier
    - func: punctuation_line_modifier
      remove_ellipses: true
    - func: line_length_modifier
      min_length: 5
    - func: substring_filter
      banlist: ['lorem ipsum']
    - func: substring_line_modifier
      banlist: ['javascript']
    # The '{' entry must stay quoted: a bare { would open a YAML flow mapping.
    - func: substring_filter
      banlist: ['{']
    - func: substring_line_modifier
      banlist:
        - 'terms of use'
        - 'privacy policy'
        - 'cookie policy'
        - 'uses cookies'
        - 'use of cookies'
        - 'use cookies'
    - func: page_length_filter
      length_type: sentence
      min_length: 3
      tokenizer: nltk
      tokenizer_lang: english
    # Double quotes are required here so that \n is read as a newline escape.
    - func: split_lines_modifier
      delimiter: "\n"

    # Chunk 2: URL Dedup
    - func: exact_dedup
      content_key: url
      normalize: normalize_url

    # Chunk 3: Paragraph Dedup
    - func: exact_dedup
      content_key: text
      normalize: normalize_whitespace_and_lowercase
      selection_key: url
      selection_normalize: hash_text

    # Chunk 4: Joining after dedup + LID + Bad word filtering
    # join_lines_modifier uses the same "\n" delimiter as split_lines_modifier
    # in Chunk 1.
    - func: join_lines_modifier
      delimiter: "\n"
    # TODO: Given our implementation I think we can delete this...
    - func: within_page_dedup
      granularity: line
      normalize: true
    # Same sentence-count filter as at the end of Chunk 1, applied again after
    # the dedup chunks (presumably because dedup can shrink pages; confirm).
    - func: page_length_filter
      length_type: sentence
      min_length: 3
      tokenizer: nltk
      tokenizer_lang: english
    - func: substring_filter
      banlist_from_fname: 'baselines/mappers/banlists/ldnoobw.txt'
      exact_word: true
    - func: detect_lang_whole_page_enricher
      model: langdetect
      key_prefix: language_id_whole_page
      seed: 0
      # The aggregated key below is key_prefix + "_" + model; language_filter
      # reads the same key with the same 0.99 threshold.
      _aggregate:
        language_id_whole_page_langdetect:
          type: histogram
          transform: threshold_transform
          threshold: 0.99
          default: "unknown"
    - func: language_filter
      key: language_id_whole_page_langdetect
      keep_languages: [en]
      threshold: 0.99