
Improve logging #4

Merged 1 commit on May 29, 2019
5 changes: 5 additions & 0 deletions progetto-tesi/author_manager.py
@@ -2,14 +2,19 @@

from scopus import AuthorSearch, AuthorRetrieval

from helpers import printl


# IMPROVE: use API 'field' attribute to only return used fields


def find_authors(authors):
    auths = list()
    printl('Getting authors')
    for author in authors:
        auths.append(find_author(author))
        printl('.')
    print(' Done')
    return list(filter(None, auths)), sum(True for a in auths if a is None)


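Side note on the return shape: find_authors now reports progress via printl and still returns a (found_authors, not_found_count) pair, where the second element counts lookups that came back as None. A hypothetical call site, mirroring how conference_manager.py unpacks and logs the pair (the argument name program_committee is an assumption, not shown in this diff):

authors, authors_not_found = find_authors(program_committee)
print('Author extraction: extracted {0}. Total not extracted: {1}'
      .format(len(authors), authors_not_found))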
11 changes: 8 additions & 3 deletions progetto-tesi/committee_manager.py
@@ -4,11 +4,13 @@

import probablepeople as pp

from config import HEADINGS, P_PROGRAM_HEADINGS
from config import HEADINGS, P_PROGRAM_HEADINGS, NER_LOSS_THRESHOLD
from util.helpers import findall
from util.webutil import polish_html
from models import Author

from helpers import printl


def extract_program_sections(text):
"""Gets all the sections referring to a program committee"""
@@ -98,14 +100,16 @@ def extract_committee(program_sections, nlp):
    # See: https://stackoverflow.com/questions/38263384/how-to-save-spacy-model-onto-cache
    # See: https://github.com/explosion/spaCy/issues/3054

    printl('Extracting program committee')
    # threshold over which we can say the NER lost a significant amount of names
    loss_threshold = 0.7
    loss_threshold = NER_LOSS_THRESHOLD
    program_committee = list()
    for section in program_sections:
        n_section_people = list()
        step = 0
        text_lines = section.splitlines()
        while True:
            printl('.')
            # run NER every `step` + offset lines and check if the result set is
            # significantly reduced
            n_step_people = list()
@@ -171,5 +175,6 @@ def extract_committee(program_sections, nlp):
            program_committee += p_to_add
        else:
            program_committee += section_people

    print(' Done')
    return program_committee
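For context on the NER_LOSS_THRESHOLD constant used above: per the in-code comment, extract_committee re-runs the NER every `step` + offset lines and checks whether the result set is significantly reduced; the actual check sits in the collapsed lines of this hunk. A minimal sketch of such a comparison, assuming the threshold is read as the fraction of names a pass may lose (the helper below is illustrative, not the repository's code):

NER_LOSS_THRESHOLD = 0.7  # mirrors the value this PR adds to config.py


def lost_significant_names(n_section_people, n_step_people,
                           threshold=NER_LOSS_THRESHOLD):
    # True when the re-run pass keeps too few of the names found so far;
    # the threshold interpretation is assumed from the code comment.
    if not n_section_people:
        return False
    loss = 1 - len(n_step_people) / len(n_section_people)
    return loss > threshold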
20 changes: 12 additions & 8 deletions progetto-tesi/conference_manager.py
@@ -12,6 +12,7 @@
import paper_manager
import util.webutil as webutil
from models import Conference, Author, Paper
from helpers import printl


base_url = 'http://www.wikicfp.com'
@@ -85,16 +86,17 @@ def search_conference(conf, lower_boundary=5, exclude_current_year=True):

def get_subject_areas(conference):
    subject_areas = []
    print("Getting conference subject areas", end="", flush=True)
    printl("Getting conference subject areas")
    for paper in conference.papers:
        paper = AbstractRetrieval(paper.scopus_id, view="FULL")
        subject_areas += [s.code for s in paper.subject_areas]
        print(".", end="", flush=True)
        printl(".")

    printl(" Done")
    return list(set(subject_areas))


def add_conference(conf, nlp):
def add_conference(conf, nlp, precise=False):
    if Conference.objects(wikicfp_id=conf.wikicfp_id):
        return

@@ -108,10 +110,10 @@ def add_conference(conf, nlp):
    if not program_committee:
        # Having a conference without program committee means we can't compare
        # the references, therefore there's no point in having it saved to db.
        print('Program committee not found')
        print('Program committee not found. Skipping conference.')
        return None

    print('PROGRAM COMMITTEE EXTRACTION:\nFound: {0}, Without affiliation: {1}'
    print('Program committee extraction: found {0}, {1} without affiliation.'
          .format(len(program_committee),
                  len([p for p in program_committee if len(p.affiliation) < 2])))

@@ -150,8 +152,7 @@ def add_conference(conf, nlp):
    conf.processing_status = "committee_extracted"
    conf.save()

    print('AUTHORS EXTRACTION:\n'
          'Total authors extracted: {0} Total not extracted: {1}'
    print('Author extraction: extracted {0}. Total not extracted: {1}'
          .format(len(authors), authors_not_found))

    # save conference papers to db
@@ -172,13 +173,14 @@ def add_conference(conf, nlp):
    conf.modify(set__papers=papers_to_add,
                set__processing_status='papers_extracted')

    print(f'PAPERS EXTRACTION: \nTotal papers extracted: {len(papers)}, '
    print(f'Papers extraction: extracted {len(papers)} papers. '
          f'Papers already in db: {papers_already_in_db}')

    # get conference's subject areas
    subject_areas = conference_manager.get_subject_areas(conf)
    conf.modify(set__subject_areas=subject_areas)

    printl('Getting references from papers')
    # save references to db
    ref_to_committee = 0
    ref_not_to_committee_db = 0
@@ -210,7 +212,9 @@ def add_conference(conf, nlp):

        ref_to_committee += len(paper.committee_refs)
        paper.save()
        printl('.')

    print(' Done')
    conf.processing_status = 'complete'
    conf.save()

3 changes: 2 additions & 1 deletion progetto-tesi/config.py
@@ -1,3 +1,4 @@
HEADINGS = ["committee", "commission"]
PROGRAM_HEADINGS = ["program", "programme"]
P_PROGRAM_HEADINGS = [f'{ph} {h}' for h in HEADINGS for ph in PROGRAM_HEADINGS]
P_PROGRAM_HEADINGS = [f'{ph} {h}' for h in HEADINGS for ph in PROGRAM_HEADINGS]
NER_LOSS_THRESHOLD = 0.7
8 changes: 4 additions & 4 deletions progetto-tesi/main.py
@@ -6,15 +6,15 @@
from models import Conference


def _add_conferences():
def _add_conferences(precise=True):
    nlp = spacy.load('en_core_web_sm')

    confs = conference_manager.load_from_xlsx("./progetto-tesi/data/cini.xlsx")[1:2]
    for conf in confs:
        conf_editions = conference_manager.search_conference(conf)
        for edition in conf_editions:
            print(f'### BEGIN conference: {edition.acronym} {edition.year} ###')
            conference_manager.add_conference(edition, nlp)
            conference_manager.add_conference(edition, nlp, precise=False)


def _add_authors_stats():
@@ -28,6 +28,6 @@ def _add_authors_stats():
if __name__ == "__main__":
    connect('tesi-triennale')

    _add_conferences()
    _add_conferences(precise=False)
    # stats_manager.plot_refs()
    # _add_authors_stats()
    _add_authors_stats()
12 changes: 5 additions & 7 deletions progetto-tesi/util/dblp.py
@@ -1,8 +1,5 @@
import sys
import time
import urllib

from bs4 import BeautifulSoup
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

@@ -12,6 +9,7 @@

score_threshold = 70


def find_author(person_to_find):
    base_url = "https://dblp.org/search"
    query = urllib.parse.urlencode({"q": person_to_find.name})
@@ -28,7 +26,7 @@ def find_author(person_to_find):
    # first ul, either contains the exact matches or likely matches
    possible_people = list()
    for li in html.select("#completesearch-authors > .body ul")[0].select('li'):
        possible_people.append(Person(
        possible_people.append(Author(
            name="".join([m.getText() for m in li.select('a mark')]),
            affiliation=li.select('small')[0].getText() if li.select('small') else "",
            dblp_url=li.select('a')[0]['href']
@@ -41,7 +39,7 @@ def find_right_person(person_to_find, people_list, is_exact):
def find_right_person(person_to_find, people_list, is_exact):
    result_message = {
        "status": "ok",
        "is_exact": is_exact # True if exact match, False if likely match
        "is_exact": is_exact  # True if exact match, False if likely match
    }

    affiliations = {i: r.affiliation for i, r in enumerate(people_list)}
@@ -56,7 +54,7 @@ def find_right_person(person_to_find, people_list, is_exact):
    # TODO: handle the likely_match case

    _, fuzz_score, best_index = process.extractOne(person_to_find.affiliation, affiliations, scorer=fuzz.token_set_ratio)

    if fuzz_score > score_threshold or (len(affiliations) == 1 and fuzz_score == 0):
        # regular best match or only one match (no affiliation)
        result_message["result"] = people_list[best_index]
@@ -78,7 +76,7 @@ def find_right_person(person_to_find, people_list, is_exact):
"err": "wrong_affiliation",
"is_exact": is_exact
}

return result_message


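A note on the process.extractOne call that find_right_person relies on: when the choices argument is a dict (here {index: affiliation}), fuzzywuzzy returns a (matched_value, score, key) triple, which is why the code can unpack best_index directly and use it to index back into people_list. A small self-contained illustration with made-up affiliations:

from fuzzywuzzy import fuzz, process

# Made-up sample data shaped like the affiliations dict built in find_right_person.
affiliations = {
    0: "University of Bologna, Italy",
    1: "Politecnico di Milano, Italy",
}

match, score, best_index = process.extractOne(
    "Univ. of Bologna", affiliations, scorer=fuzz.token_set_ratio)
# match is the affiliation string, score the similarity (0-100),
# best_index the dict key used to look up the corresponding person.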
10 changes: 7 additions & 3 deletions progetto-tesi/util/helpers.py
@@ -1,7 +1,11 @@
def findall(p, s):
    '''Yields all the positions of
    the pattern p in the string s.'''
    '''Yields all the positions of the pattern p in the string s.'''
    i = s.find(p)
    while i != -1:
        yield i
        i = s.find(p, i+1)
        i = s.find(p, i+1)


def printl(msg):
    '''Prints msg on the same line as previous printl messages.'''
    print(msg, end="", flush=True)
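The printl helper added here is what produces the progress output used throughout this PR: a label, one dot per processed item, then a closing print(' Done') that also ends the line. A tiny usage sketch (the item list is a placeholder, not repository data):

from helpers import printl  # imported the same way the other modules in this PR do

items = ['a', 'b', 'c']  # placeholder work items
printl('Processing items')
for item in items:
    # ... real work would happen here ...
    printl('.')
print(' Done')  # console shows: Processing items... Done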