Skip to content

Commit

Permalink
bug fix
Browse files: browse the repository at this point in the history
  • Loading branch information
tomleung1996 committed Jan 31, 2019
1 parent 06ce4b9 commit 80f9c1b
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 30 deletions.
4 changes: 2 additions & 2 deletions wos_crawler/model/wos_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class WosDocument(Base):
journal = Column(String(255))
journal_iso = Column(String(100))
publisher = Column(String(255))
volume = Column(Integer)
volume = Column(String(50)) # 有可能有AB卷
issue = Column(String(10)) # 因为可能有SI:Special Issue,所以是String
start_page = Column(String(10)) # 因为有可能是电子出版,页码包含E
end_page = Column(String(10))
Expand Down Expand Up @@ -171,7 +171,7 @@ class WosReference(Base):
first_author = Column(String(255))
pub_year = Column(Integer)
journal = Column(String(255))
volume = Column(Integer)
volume = Column(String(50)) # 有可能有AB卷
start_page = Column(String(100)) #因为有电子出版的可能,所以有可能是E开头
doi = Column(String(255))

Expand Down
8 changes: 7 additions & 1 deletion wos_crawler/parsers/bibtex/wos/customization.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def reference(document):
volume_pattern = re.compile(r'^v\d+$')
page_pattern = re.compile(r'^p\w*\d+$')
doi_pattern = re.compile(r'^doi \d+.+$')
year_pattern = re.compile(r'^\d+$')
year_pattern = re.compile(r'^\d{4}$')

result = []
references = document['cited-references'][1:-1].lower().replace('{[}', '[').replace('\\', '').split('\n')
Expand Down Expand Up @@ -308,6 +308,12 @@ def reference(document):
print(e)
exit(-1)

# 由于参考文献字段非常不规范,经常超长,所以使用截断
if first_author is not None and len(first_author) > 254:
first_author = first_author[:254]
if journal is not None and len(journal) > 254:
journal = journal[:254]


result.append((first_author, pub_year, journal, volume, start_page, doi))
document['cited-references'] = result
Expand Down
47 changes: 20 additions & 27 deletions wos_crawler/parsers/plaintext/wos/plaintex_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def parse_single(input_file=None, db_path=None):
volume_pattern = re.compile(r'^v\d+$')
page_pattern = re.compile(r'^p\w*\d+$')
doi_pattern = re.compile(r'^doi \d+.+$')
year_pattern = re.compile(r'^\d+$')
year_pattern = re.compile(r'^\d{4}$')

print('正在解析{}……'.format(input_file))

Expand Down Expand Up @@ -139,28 +139,7 @@ def parse_single(input_file=None, db_path=None):
funding_line += ' ' + line[3:]
else:
funding_line = line[3:]
# if wos_document.funding_text is not None:
# wos_document.funding_text += ' '+line[3:]
# else:
# wos_document.funding_text = line[3:]
elif cur_field == 'fx ':
# if tmp == cur_field:
# # 说明现在是第一行,需要处理上面暂存的内容了
# fundings = wos_document.funding_text.split('; ')
# for fund in fundings:
# pos = find_nth(fund, '[', -1)
# if pos != -1:
# funding = [fund[:pos], fund[pos:]]
# agent = funding[0]
# numbers = funding[1].replace('[', '').replace(']', '').split(', ')
# for number in numbers:
# f = WosFunding(agent, number)
# f.document = wos_document
# else:
# agent = fund
# f = WosFunding(agent, None)
# f.document = wos_document
# wos_document.funding_text = None
if wos_document.funding_text is not None:
wos_document.funding_text += ' ' + line[3:]
else:
Expand Down Expand Up @@ -240,17 +219,23 @@ def parse_single(input_file=None, db_path=None):
first_author = ref_split[0]
journal = ref_split[1]

# 由于参考文献字段非常不规范,经常超长,所以使用截断
if first_author is not None and len(first_author) > 254:
first_author = first_author[:254]
if journal is not None and len(journal) > 254:
journal = journal[:254]

ref = WosReference(first_author, pub_year, journal, volume, start_page, doi)
ref.document = wos_document

elif cur_field == 'nr ':
wos_document.reference_num = line[3:]
wos_document.reference_num = int(line[3:])
elif cur_field == 'tc ':
wos_document.cited_times = line[3:]
wos_document.cited_times = int(line[3:])
elif cur_field == 'u1 ':
wos_document.usage_180 = line[3:]
wos_document.usage_180 = int(line[3:])
elif cur_field == 'u2':
wos_document.usage_since_2013 = line[3:]
wos_document.usage_since_2013 = int(line[3:])
elif cur_field == 'pu ':
wos_document.publisher = line[3:]
elif cur_field == 'ji ':
Expand Down Expand Up @@ -286,27 +271,35 @@ def parse_single(input_file=None, db_path=None):
if keyword_line is not None:
keywords = keyword_line.split('; ')
for keyword in keywords:
if len(keyword) > 254:
keyword = keyword[:254]
key = WosKeyword(keyword)
key.document = wos_document
keyword_line = None

if keyword_plus_line is not None:
keyword_plus = keyword_plus_line.split('; ')
for kp in keyword_plus:
if len(kp) > 254:
kp = kp[:254]
keyp = WosKeywordPlus(kp)
keyp.document = wos_document
keyword_plus_line = None

if wos_category_line is not None:
categories = wos_category_line.split('; ')
for category in categories:
if len(category) > 254:
category = category[:254]
cat = WosCategory(category)
cat.document = wos_document
wos_category_line = None

if research_area_line is not None:
areas = research_area_line.split('; ')
for area in areas:
if len(area) > 254:
area = area[:254]
a = WosResearchArea(area)
a.document = wos_document
research_area_line = None
Expand Down Expand Up @@ -350,5 +343,5 @@ def parse(input_dir=None, db_path=None):


if __name__ == '__main__':
parse(input_dir=r'C:\Users\Tom\PycharmProjects\wos_crawler\output\advanced_query\2019-01-31-10.21.19',
parse(input_dir=r'C:\Users\Tom\PycharmProjects\wos_crawler\output\advanced_query\remain',
db_path='C:/Users/Tom/Desktop/test2.db')

0 comments on commit 80f9c1b

Please sign in to comment.