-
Notifications
You must be signed in to change notification settings - Fork 1
/
mediawiki2text.sh
executable file
·45 lines (44 loc) · 1.71 KB
/
mediawiki2text.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/bin/sh
### Converts mediawiki dumps to text corpus
### Usage: bzcat mediawikidump.xml.bz2 | ./mediawiki2text.sh
### By Jon Dehdari, 2010
perl -p -e '
s/\n/__NL__/g;
#s/<text ([^>]+>)/\n<text $1/g;
s/<\/text>/<\/text>\n/g;
s/<title>/\n<title>/g;
s/<\/title>/<\/title>/g;
' |
egrep '<title>' |
egrep -v '<title>[^<]+:' | # rm Meta pages
fgrep -iv '#redirect' | # rm redirects
perl -p -e ' # multi-line-spanning deletions
s/<\/title>.+?<text/<\/title>__NL__<text/g; # rm other header info
s/{\|.+?\|}//g; # rm tables
s/{{[^{]+?}}//g; # rm templates
s/{{[^{]+?}}//g; # rm templates
s/<div>.+?<\/div>//g; # rm line-spanning HTML thing
s/<ref>.+?<\/ref>//g; # rm line-spanning HTML thing
s/<inputbox>.+?<\/inputbox>//g; # rm line-spanning HTML thing
s/<center>.+?<\/center>//g; # rm line-spanning HTML thing
s/<.+?>//g; # rm HTML tags
s/__NL__/\n/g;
' |
perl -p -e ' # most deletions go here
s{<title>(.+)</title>}{\n# Title: $1}g;
s/<text [^>]+>//g;
s/<\/text>//g;
s/^[*#].*//g; # rm list items
s/^==.+//g; # rm section headers
s/\[http:\S+ (.+?)\]/$1/g; # rm ext links
s/\[\[[^:]+?\|(.+?)\]\]/$1/g; # rm wikilinks with alternate text
s/\[\[([^:]+?)\]\]/$1/g; # rm plain wikilinks
s/\[\[.+?:[^[]*?\]\]//g; # rm meta links (images, interwiki, categories, etc)
s/\[\[ *\]\](:?\S+| ?)//g; # rm empty wikilinks left over by previous deletions
s/\( *\) ?//g; # rm empty parentheses left over by previous deletions
s/"/"/g; # unHTMLize double-quotes
' |
perl -00 -p -e " # cleanup
s/'''//g; # rm italics & bold markers
s/\n{3,}/\n\n/g; # rm multiple newlines
"