 import argparse
 import gc
 import sys
-import urllib.request, urllib.parse, urllib.error
+import urllib.request
 import re
 import bz2
 import os.path
 from html.entities import name2codepoint
-#import fnmatch
 import shutil
 import mimetypes
 import gzip
@@ -70,7 +69,7 @@
 ### PARAMS ####################################################################
 
 # This is obtained from the dump itself
-prefix = None
+PREFIX = None
 
 ##
 # Whether to preserve links in output
@@ -86,12 +85,12 @@
 # Recognize only these namespaces
 # w: Internal links to the Wikipedia
 #
-acceptedNamespaces = set(['w'])
+ACCEPTED_NAMESPACES = set(['w'])
 
 ##
 # Drop these elements from article text
 #
-discardElements = set([
+DISCARD_ELEMENTS = set([
     'gallery', 'timeline', 'noinclude', 'pre',
     'table', 'tr', 'td', 'th', 'caption',
     'form', 'input', 'select', 'option', 'textarea',
@@ -132,7 +131,7 @@
 ## print(footer, file=out)
 
 def WikiDocumentSentences(out, id, title, tags, text):
-    url = get_url(id, prefix)
+    url = get_url(id, PREFIX)
     header = '\n {0}:{1}'.format(title, "|||".join(tags))
     # Separate header from text with a newline.
     text = clean(text)
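For reference, the header line built above joins the article title and its tags with a '|||' separator; a quick sketch using the same format call with hypothetical values:

    title, tags = 'Anarchism', ['vital', 'politics']   # hypothetical inputs
    header = '\n {0}:{1}'.format(title, "|||".join(tags))
    # header is now '\n Anarchism:vital|||politics'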
@@ -176,7 +175,7 @@ def normalizeTitle(title):
         rest = m.group(3)
 
         ns = prefix.capitalize()
-        if ns in acceptedNamespaces:
+        if ns in ACCEPTED_NAMESPACES:
             # If the prefix designates a known namespace, then it might be
             # followed by optional whitespace that should be removed to get
             # the canonical page name
@@ -224,7 +223,7 @@ def fixup(m):
 
 # Match elements to ignore
 discard_element_patterns = []
-for tag in discardElements:
+for tag in DISCARD_ELEMENTS:
     pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE)
     discard_element_patterns.append(pattern)
 
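To see what these patterns strip: each one matches an opening tag with any attributes, then everything up to the matching closing tag, with newlines included via re.DOTALL. A minimal, self-contained sketch for the 'table' element (the sample text is made up):

    import re
    pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % ('table', 'table'),
                         re.DOTALL | re.IGNORECASE)
    sample = 'kept <TABLE class="infobox">rows\nhere</table> kept'
    print(pattern.sub('', sample))  # -> 'kept  kept'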
@@ -353,7 +352,7 @@ def make_anchor_tag(match):
     global keepLinks
     link = match.group(1)
     colon = link.find(':')
-    if colon > 0 and link[:colon] not in acceptedNamespaces:
+    if colon > 0 and link[:colon] not in ACCEPTED_NAMESPACES:
         return ''
     trail = match.group(3)
     anchor = match.group(2)
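The colon test above drops any wiki link whose namespace prefix is not accepted. A small sketch of the guard in isolation (namespace_ok is a hypothetical helper, not part of this file):

    ACCEPTED_NAMESPACES = set(['w'])  # as defined earlier in this diff

    def namespace_ok(link):
        # Mirrors the guard in make_anchor_tag: a prefixed link outside
        # the accepted namespaces is dropped from the output.
        colon = link.find(':')
        return not (colon > 0 and link[:colon] not in ACCEPTED_NAMESPACES)

    print(namespace_ok('File:Example.jpg'))  # False -> link removed
    print(namespace_ok('w:Some Article'))    # True  -> link kept
    print(namespace_ok('Plain title'))       # True  -> no namespace prefix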
@@ -587,7 +586,7 @@ def file_name(self):
 
 def process_data(ftype, input, output_sentences, output_structure, incubator,
                  vital_titles=None, vital_tags=None):
-    global prefix
+    global PREFIX
     page = []
     id = None
     inText = False
@@ -625,7 +624,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator,
             page.append(line)
         elif tag == '/page':
             colon = title.find(':')
-            if (colon < 0 or title[:colon] in acceptedNamespaces) and \
+            if (colon < 0 or title[:colon] in ACCEPTED_NAMESPACES) and \
                 not redirect:
                 if (not vital_titles) or (title in vital_titles):
                     if ((incubator != '') and (lang[1] == incubator) and len(lang) > 2):
@@ -648,7 +647,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator,
             # discover prefix from the xml dump file
             # /mediawiki/siteinfo/base
             base = m.group(3)
-            prefix = base[:base.rfind("/")]
+            PREFIX = base[:base.rfind("/")]
 
 ##def load_vital_titles(vitalfn):
 ##    """Given the filename for the vital titles list (one title per line, with
@@ -698,7 +697,7 @@ def get_argparser():
     return parser
 
 def main():
-    global keepLinks, keepSections, prefix, acceptedNamespaces
+    global keepLinks, keepSections, PREFIX, ACCEPTED_NAMESPACES
     script_name = os.path.basename(sys.argv[0])
 
     parser = get_argparser()