
Commit 0f313a9

update the constant names and imports

Constant names are converted to all uppercase; the commented-out fnmatch import is removed, and the unused urllib.parse and urllib.error imports are dropped.

1 parent eda86a2 · commit 0f313a9

File tree

7 files changed: +51 lines, -13 lines

.idea/.gitignore: 3 additions, 0 deletions
.idea/WikiExtractor.iml: 12 additions, 0 deletions
.idea/inspectionProfiles/profiles_settings.xml: 6 additions, 0 deletions
.idea/misc.xml: 4 additions, 0 deletions
.idea/modules.xml: 8 additions, 0 deletions
.idea/vcs.xml: 6 additions, 0 deletions

WikiExtractor.py: 12 additions, 13 deletions
@@ -52,12 +52,11 @@
 import argparse
 import gc
 import sys
-import urllib.request, urllib.parse, urllib.error
+import urllib.request
 import re
 import bz2
 import os.path
 from html.entities import name2codepoint
-#import fnmatch
 import shutil
 import mimetypes
 import gzip
@@ -70,7 +69,7 @@
 ### PARAMS ####################################################################
 
 # This is obtained from the dump itself
-prefix = None
+PREFIX = None
 
 ##
 # Whether to preseve links in output
@@ -86,12 +85,12 @@
 # Recognize only these namespaces
 # w: Internal links to the Wikipedia
 #
-acceptedNamespaces = set(['w'])
+ACCEPTED_NAMESPACES= set(['w'])
 
 ##
 # Drop these elements from article text
 #
-discardElements = set([
+DISCARD_ELEMENTS = set([
         'gallery', 'timeline', 'noinclude', 'pre',
         'table', 'tr', 'td', 'th', 'caption',
         'form', 'input', 'select', 'option', 'textarea',
@@ -132,7 +131,7 @@
 ## print(footer, file=out)
 
 def WikiDocumentSentences(out, id, title, tags, text):
-    url = get_url(id, prefix)
+    url = get_url(id, PREFIX)
     header = '\n{0}:{1}'.format(title, "|||".join(tags))
     # Separate header from text with a newline.
     text = clean(text)
@@ -176,7 +175,7 @@ def normalizeTitle(title):
         rest = m.group(3)
 
         ns = prefix.capitalize()
-        if ns in acceptedNamespaces:
+        if ns in ACCEPTED_NAMESPACES:
             # If the prefix designates a known namespace, then it might be
             # followed by optional whitespace that should be removed to get
             # the canonical page name
@@ -224,7 +223,7 @@ def fixup(m):
 
 # Match elements to ignore
 discard_element_patterns = []
-for tag in discardElements:
+for tag in DISCARD_ELEMENTS:
     pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE)
     discard_element_patterns.append(pattern)
 
@@ -353,7 +352,7 @@ def make_anchor_tag(match):
     global keepLinks
     link = match.group(1)
     colon = link.find(':')
-    if colon > 0 and link[:colon] not in acceptedNamespaces:
+    if colon > 0 and link[:colon] not in ACCEPTED_NAMESPACES:
         return ''
     trail = match.group(3)
     anchor = match.group(2)
@@ -587,7 +586,7 @@ def file_name(self):
 
 def process_data(ftype, input, output_sentences, output_structure, incubator,
                  vital_titles=None, vital_tags=None):
-    global prefix
+    global PREFIX
     page = []
     id = None
     inText = False
@@ -625,7 +624,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator,
             page.append(line)
         elif tag == '/page':
             colon = title.find(':')
-            if (colon < 0 or title[:colon] in acceptedNamespaces) and \
+            if (colon < 0 or title[:colon] in ACCEPTED_NAMESPACES) and \
                     not redirect:
                 if (not vital_titles) or (title in vital_titles):
                     if((incubator != '') and (lang[1] == incubator) and len(lang) > 2):
@@ -648,7 +647,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator,
         # discover prefix from the xml dump file
         # /mediawiki/siteinfo/base
         base = m.group(3)
-        prefix = base[:base.rfind("/")]
+        PREFIX = base[:base.rfind("/")]
 
 ##def load_vital_titles(vitalfn):
 ##    """Given the filename for the vital titles list (one title per line, with
@@ -698,7 +697,7 @@ def get_argparser():
     return parser
 
 def main():
-    global keepLinks, keepSections, prefix, acceptedNamespaces
+    global keepLinks, keepSections, PREFIX, ACCEPTED_NAMESPACES
     script_name = os.path.basename(sys.argv[0])
 
     parser = get_argparser()
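
For context on the rename, the DISCARD_ELEMENTS constant feeds the pattern-compilation loop shown in the diff above. The snippet below is a minimal standalone sketch of how those compiled patterns strip discarded markup; the reduced element set and the sample text are made up for illustration and are not taken from the repository.

import re

# Subset of the module-level constant renamed in this commit (illustrative only).
DISCARD_ELEMENTS = set(['gallery', 'timeline', 'noinclude', 'pre'])

# Same pattern construction as in the diff: match an opening tag and everything
# up to its closing tag, across newlines, case-insensitively.
discard_element_patterns = []
for tag in DISCARD_ELEMENTS:
    pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag),
                         re.DOTALL | re.IGNORECASE)
    discard_element_patterns.append(pattern)

# Hypothetical article fragment, not from the repository.
text = 'Intro text <gallery>File:Foo.jpg</gallery> more text.'
for pattern in discard_element_patterns:
    text = pattern.sub('', text)
print(text)  # -> 'Intro text  more text.'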
