Skip to content

Commit

Permalink
Add indexes and multithreading
Browse files Browse the repository at this point in the history
  • Loading branch information
fabiosangregorio committed Jul 7, 2019
1 parent 51003f5 commit 85dd31c
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 50 deletions.
87 changes: 40 additions & 47 deletions progetto-tesi/conference_manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from datetime import datetime
import time
from multiprocessing import Pool

import xlrd
from fuzzywuzzy import fuzz
Expand Down Expand Up @@ -194,55 +195,47 @@ def add_conference(conf, nlp):

printl('Getting references from papers')
# save references to db
ref_to_committee = 0
ref_not_to_committee_db = 0
ref_not_to_committee_not_db = 0
times = []
for paper in conf.papers:
ref_eids = paper_manager.extract_references_from_paper(paper)
start = time.time()
for eid in ref_eids:
# if there's a reference to the program committee, get the pc author
found = False
for a in conf.program_committee:
if eid in a.eid_list:
paper.committee_refs.append(a)
found = True
break
if found:
continue
# else:
# auth = Author.objects(eid_list__in=eid).upsert_one(
# set_on_insert__eid_list=[eid])
# FIXME: check why upsert is not working
auth = AuthorIndex.objects(eid=eid).first()
# auth = Author.objects(eid_list__in=[eid]).first()
if auth:
auth = auth.author
ref_not_to_committee_db += 1
else:
auth = Author(eid_list=[eid]).save()
AuthorIndex(eid=eid, author=auth).save()
ref_not_to_committee_not_db += 1
paper.non_committee_refs.append(auth)
times.append(time.time() - start)
print("paper: ", time.time() - start)

if not ref_eids:
printl('x')
paper.delete()
continue

ref_to_committee += len(paper.committee_refs)
paper.save()
printl('.')
start = time.time()
pool = Pool()
pool.map(_save_paper_refs, [(p, conf) for p in conf.papers])
pool.close()
print("conf: ", time.time() - start)

print(' Done')
print("avg time per paper: ", sum(times) / len(times))
conf.processing_status = 'complete'
conf.save()

print(f'REFERENCES OF ALL PAPERS EXTRACTION: \nRefs to committee: '
f'{ref_to_committee}, Refs not to committee already in db: '
f'{ref_not_to_committee_db}, ref not to committee not in db: '
f'{ref_not_to_committee_not_db}')

def _save_paper_refs(data):
    """Resolve and store the references of a single paper.

    Takes one ``(paper, conf)`` tuple rather than two arguments so it can
    be handed to ``Pool.map``, which passes a single item per call.

    Every eid extracted from the paper is attached to it either as a
    committee reference (when it appears in the ``eid_list`` of a member
    of ``conf.program_committee``) or as a non-committee reference. For
    the latter the author is looked up through ``AuthorIndex`` and, when
    no entry exists, a new ``Author`` and matching ``AuthorIndex`` are
    saved. A paper with no extracted eids is deleted; otherwise it is
    saved.
    """
    paper, conf = data
    ref_eids = paper_manager.extract_references_from_paper(paper)

    for eid in ref_eids:
        # First program-committee member that owns this eid, if any.
        member = next(
            (pc for pc in conf.program_committee if eid in pc.eid_list),
            None)
        if member is not None:
            paper.committee_refs.append(member)
            continue

        # FIXME: an upsert on Author (eid_list__in / set_on_insert__eid_list)
        # was tried here instead of lookup-then-create; check why it is not
        # working.
        index_entry = AuthorIndex.objects(eid=eid).first()
        if index_entry:
            author = index_entry.author
        else:
            # Unknown eid: create the author and index it for later lookups.
            author = Author(eid_list=[eid]).save()
            AuthorIndex(eid=eid, author=author).save()
        paper.non_committee_refs.append(author)

    if ref_eids:
        paper.save()
        printl('.')
    else:
        # Nothing could be extracted for this paper: drop it.
        printl('x')
        paper.delete()
2 changes: 1 addition & 1 deletion progetto-tesi/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
HEADINGS = ["committee", "commission"]
PROGRAM_HEADINGS = ["program", "programme"]
PROGRAM_HEADINGS = ["program", "programme", "review"]
P_PROGRAM_HEADINGS = [f'{ph} {h}' for h in HEADINGS for ph in PROGRAM_HEADINGS]
NER_LOSS_THRESHOLD = 0.7
4 changes: 2 additions & 2 deletions progetto-tesi/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def _add_conferences():
nlp = spacy.load('en_core_web_sm')

confs = conference_manager.load_from_xlsx("./progetto-tesi/data/cini.xlsx")[1:4]
confs = conference_manager.load_from_xlsx("./progetto-tesi/data/cini.xlsx")[3:4]
for conf in confs:
conf_editions = conference_manager.search_conference(conf)
for edition in conf_editions:
Expand All @@ -25,8 +25,8 @@ def _add_authors_stats():
not_committee_mentions_ratio=stats.not_committee_ratio)


connect('tesi-triennale', alias="default")
if __name__ == "__main__":
connect('tesi-triennale')

_add_conferences()
# stats_manager.plot_refs()
Expand Down

0 comments on commit 85dd31c

Please sign in to comment.