From 159d17e52a49a8d41800c909b8743a31a76ce9e3 Mon Sep 17 00:00:00 2001 From: iBotamon Date: Sat, 7 Sep 2019 14:02:14 +0900 Subject: [PATCH 1/2] medial_ai.py -> medical/ai.py --- medical/ai.py | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 medical/ai.py diff --git a/medical/ai.py b/medical/ai.py new file mode 100644 index 0000000..2a938df --- /dev/null +++ b/medical/ai.py @@ -0,0 +1,283 @@ +# -*- coding: utf-8 -*- + +import argparse +import bs4 +import collections +import pyperclip +import sys +import urllib +import urllib.request + + +# keyword-based classification of medical/non-medical AI papers + +def title_is_medical(title): + keywords = ['medic', 'biomedic', 'bioMedic', 'health', 'clinic', 'EHR', 'MeSH', 'RCT', 'life', 'care', 'pharm', 'food-drug', 'drug', 'surg',\ + 'emergency', 'ICU', 'hospital', 'patient', 'doctor', 'disease', 'illness', 'symptom', 'treatment',\ + 'cancer', 'psycholog', 'psychiat', 'mental', 'radiol', 'patho', 'autopsy', 'x-ray', 'x-Ray', 'mammogr', 'CT', 'MRI', 'radiograph', 'tomograph',\ + 'magnetic'] + + for keyword in keywords: + for kw in (keyword, keyword.upper(), keyword.capitalize()): + if (((' ' + kw) in title) or title.startswith(kw)): + return True + else: + continue + + return False + + + +# get args when executed via command-line + +def get_args(): + description=''' +++++++++++++++++++++++++++++++++++++++++++++++++++ +Pickup medical AI paper titles and URLs from specified conference and year. +会議名と年数を指定すると, 医療に関連するAI論文のみを探し出してタイトルとURLを列挙します. + +To get from ACL 2019, input like this: python3 medical_ai.py acl 2019 +例えばACL 2019採択論文から探すには本プログラムを python3 medical medical_ai.py acl 2019 と実行してください. + +Conference name is case insensitive. +会議名は大文字でも小文字でも構いません. + +To output HTML link tags or markdown links, use options below. +以下に示すオプションを使うと, 結果をHTMLリンクタグやMarkdownリンクとして出力することも可能です. +++++++++++++++++++++++++++++++++++++++++++++++++++ + ''' + parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter) + group_output = parser.add_mutually_exclusive_group() + group_less = parser.add_mutually_exclusive_group() + parser.add_argument(dest='conferences_and_years', nargs='+', help='speficy conferences and years\n example1: acl 2019\n example2: acl naacl 2019\n example3: acl 2018 2019\n example4: acl naacl 2018 2019') + parser.add_argument('-q', '--quiet', help='be more quiet', action='store_true', dest='quiet') + parser.add_argument('--copy', help='copy result to clipboard', action='store_true', dest='copy') + parser.add_argument('-a', '--all', help='get also non-medical AI papers', action='store_true', dest='all') + group_output.add_argument('-m', '--md', '--markdown', help='output as markdown links\ncollaborates with --url-only\nignores --title-only\n', action='store_true', dest='markdown') + group_output.add_argument('--html', help='output as HTML tags\ncollaborates with --url-only\nignores --title-only\n', action='store_true', dest='html') + group_less.add_argument('--title-only', help='output paper title only', action='store_true', dest='title_only') + group_less.add_argument('--url-only', help='output paper URL only', action='store_true', dest='url_only') + args = parser.parse_args() + return args + + + +# throw HTTP request + +def search(conference, year, *config): + # + # conference: str or list + # + # for natural language processing conferences: + # ('acl', 'anlp', 'cl', 'conll', 'eacl', + # 'emnlp', 'naacl', 'semeval', 'tacl', + # 'ws', 'alta', 'coling', 'hlt', + # 'ijcnlp', 'jep-taln-recital', 'lrec', + # 'muc', 'paclic', 'ranlp', + # 'rocling-ijclclp', 'tinlap', 'tipster') + # + # for machine learning conferences: + # ('nips', 'icml', 'iclr', 'ijcnn', 'ijcai') + # + # for computer vision conferences: + # ('cvpr', 'iccv') + # + # year: str or int or list (1965 or greater) + # + # *config: argparse.Namespace object (optional) + # + # + # collections.OrderedDict {:} + + conferences = { 'NLP' : ['acl', 'anlp', 'cl', 'conll', 'eacl', 'emnlp', 'naacl',\ + 'semeval', 'tacl', 'ws', 'alta', 'coling', 'hlt',\ + 'ijcnlp', 'jep-taln-recital', 'lrec', 'muc', 'paclic', 'ranlp',\ + 'rocling-ijclclp', 'tinlap', 'tipster'],\ + 'ML' : ['nips', 'icml', 'iclr', 'ijcnn', 'ijcai'],\ + 'CV' : ['cvpr', 'iccv']} + + sources = {} + + for conf in conferences['NLP']: + sources[conf] = 'aclweb' + for conf in conferences['ML']: + sources[conf] = 'dblp' + for conf in conferences['CV']: + sources[conf] = 'dblp' + + url_container = { 'aclweb' : 'https://aclweb.org/anthology/events/{0}-{1}',\ + 'dblp' : 'https://dblp.org/db/conf/{0}/{0}{1}.html'} + + class Query(): + def __init__(self, conf, yr): + self.conference = conf + self.year = yr + self.config = None + self.res = None + self.url = None + self.source = None + self.config = None + + queries = [] + + if type(conference) is not list: + conference = [conference] + if type(year) is not list: + year = [year] + + for c in conference: + for y in year: + query = Query(c.lower(), str(y)) + query.config = config[0] + + # check conference name + try: + query.source = sources[query.conference] + query.url = url_container[query.source].format(query.conference, query.year) + queries.append(query) + except KeyError: + seps = '=' * 35 + print("Error: unavailable conference '{}'.".format(query.conference)) + print(seps) + print('Available conferences:') + print('\tML, AI:\n\t\t{}'.format(', '.join(conferences['ML']))) + print('\tCV:\n\t\t{}'.format(', '.join(conferences['CV']))) + print('\tNLP:\n\t\t{}'.format(', '.join(conferences['NLP']))) + print(seps) + + + # make connections + for q in queries: + print('Connecting for {} {} ...'.format(q.conference.upper(), q.year)) + try: + with urllib.request.urlopen(q.url) as res: + parse(res, q) + except urllib.error.HTTPError as err: + print('Error: {} {}'.format(err.code, err.reason)) + except urllib.error.URLError as err: + print('Error: {}'.format(err.reason)) + + + +class Article(): + def __init__(self, title='', author=[], abstract='', conference='', year=0, url=''): + self.title = title + self.author = author + self.abstract = abstract + self.conference = conference + self.year = year + self.url = url + + + +# process received HTTP response + +def parse(res, query): + selector = {'aclweb' : 'a[class="align-middle"]',\ + 'dblp' : 'span[class="title"]'} + + url_getter = {'aclweb' : lambda tag: 'https://aclweb.org' + tag.attrs['href'] if tag.attrs['href'].startswith('/anthology/paper') else None,\ + 'dblp' : lambda tag: tag.parent.parent.contents[2].ul.li.div.a['href']} + + prev_title = '' + n_total = 0 + articles = [] + + # get html content + html = res.read() + soup = bs4.BeautifulSoup(html, 'html5lib') + + # extract articles + for tag in soup.select(selector[query.source]): + skip = False + title = tag.getText() + if title != prev_title: + n_total += 1 + prev_title = title + if query.config.all or title_is_medical(title): + url = url_getter[query.source](tag) + if url is None: + continue + else: + article = Article(title=title, url=url, conference=query.conference, year=query.year) + articles.append(article) + + if not query.config.quiet: + sys.stdout.write('\rSearching... {} match / {}'.format(len(articles), n_total)) + sys.stdout.flush() + + + # prepare output display + output = '' + + if articles: + if query.config.markdown: + if query.config.url_only: + output = '\n'.join([ '[{0}]({0})'.format(article.url) for article in articles ]) + else: + output = '\n'.join([ '[{0}]({1})'.format(article.title, article.url) for article in articles ]) + elif query.config.html: + if query.config.url_only: + output = '
\n'.join([ '
{0}'.format(article.url) for article in articles ]) + else: + output = '
\n'.join([ '{0}'.format(article.title.replace('"', "'"), article.url) for article in articles ]) + else: + if query.config.title_only: + output = '\n'.join([ article.title for article in articles ]) + elif query.config.url_only: + output = '\n'.join([ article.url for article in articles ]) + else: + output = '\n\n'.join([ '{0}\n{1}'.format(article.title, article.url) for article in articles ]) + else: + output = 'No medical-like AI papers found.' + + + # display output + + seps = '=' * 35 + + if query.config.quiet: + if articles: + if not query.config.all: + print('Medical-like AI papers in {} {}: {} / {}'.format(query.conference.upper(), query.year, len(articles), n_total)) + else: + print('All papers in {} {}: {}'.format(query.conference.upper(), query.year, len(articles))) + else: + print(output) + else: + sys.stdout.write('\n') + if articles: + print(seps) + print(output) + print(seps) + if not query.config.all: + print('Medical-like AI papers in {} {}: {} / {}'.format(query.conference.upper(), query.year, len(articles), n_total)) + else: + print('All papers in {} {}: {}'.format(query.conference.upper(), query.year, len(articles))) + print(seps) + else: + print(output) + + + # copy onto clipboard if needed + if query.config.copy: + pyperclip.copy(output) + print(' * * * Copied this result to clipboard * * *') + + # return OrderedDict + return articles + + +if __name__ == '__main__': + config = get_args() + config.conferences = [] + config.years = [] + + for value in config.conferences_and_years: + try: + value = int(value) + config.years.append(value) + except ValueError: + config.conferences.append(value) + + search(config.conferences, config.years, config) From 06407e940418b544a7996f6f6c7b3fc3dec4f4a2 Mon Sep 17 00:00:00 2001 From: iBotamon Date: Sat, 7 Sep 2019 14:03:52 +0900 Subject: [PATCH 2/2] medical_ai.py -> medical/ai.py --- medical_ai.py | 283 -------------------------------------------------- 1 file changed, 283 deletions(-) delete mode 100644 medical_ai.py diff --git a/medical_ai.py b/medical_ai.py deleted file mode 100644 index 2a938df..0000000 --- a/medical_ai.py +++ /dev/null @@ -1,283 +0,0 @@ -# -*- coding: utf-8 -*- - -import argparse -import bs4 -import collections -import pyperclip -import sys -import urllib -import urllib.request - - -# keyword-based classification of medical/non-medical AI papers - -def title_is_medical(title): - keywords = ['medic', 'biomedic', 'bioMedic', 'health', 'clinic', 'EHR', 'MeSH', 'RCT', 'life', 'care', 'pharm', 'food-drug', 'drug', 'surg',\ - 'emergency', 'ICU', 'hospital', 'patient', 'doctor', 'disease', 'illness', 'symptom', 'treatment',\ - 'cancer', 'psycholog', 'psychiat', 'mental', 'radiol', 'patho', 'autopsy', 'x-ray', 'x-Ray', 'mammogr', 'CT', 'MRI', 'radiograph', 'tomograph',\ - 'magnetic'] - - for keyword in keywords: - for kw in (keyword, keyword.upper(), keyword.capitalize()): - if (((' ' + kw) in title) or title.startswith(kw)): - return True - else: - continue - - return False - - - -# get args when executed via command-line - -def get_args(): - description=''' -++++++++++++++++++++++++++++++++++++++++++++++++++ -Pickup medical AI paper titles and URLs from specified conference and year. -会議名と年数を指定すると, 医療に関連するAI論文のみを探し出してタイトルとURLを列挙します. - -To get from ACL 2019, input like this: python3 medical_ai.py acl 2019 -例えばACL 2019採択論文から探すには本プログラムを python3 medical medical_ai.py acl 2019 と実行してください. - -Conference name is case insensitive. -会議名は大文字でも小文字でも構いません. - -To output HTML link tags or markdown links, use options below. -以下に示すオプションを使うと, 結果をHTMLリンクタグやMarkdownリンクとして出力することも可能です. -++++++++++++++++++++++++++++++++++++++++++++++++++ - ''' - parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter) - group_output = parser.add_mutually_exclusive_group() - group_less = parser.add_mutually_exclusive_group() - parser.add_argument(dest='conferences_and_years', nargs='+', help='speficy conferences and years\n example1: acl 2019\n example2: acl naacl 2019\n example3: acl 2018 2019\n example4: acl naacl 2018 2019') - parser.add_argument('-q', '--quiet', help='be more quiet', action='store_true', dest='quiet') - parser.add_argument('--copy', help='copy result to clipboard', action='store_true', dest='copy') - parser.add_argument('-a', '--all', help='get also non-medical AI papers', action='store_true', dest='all') - group_output.add_argument('-m', '--md', '--markdown', help='output as markdown links\ncollaborates with --url-only\nignores --title-only\n', action='store_true', dest='markdown') - group_output.add_argument('--html', help='output as HTML tags\ncollaborates with --url-only\nignores --title-only\n', action='store_true', dest='html') - group_less.add_argument('--title-only', help='output paper title only', action='store_true', dest='title_only') - group_less.add_argument('--url-only', help='output paper URL only', action='store_true', dest='url_only') - args = parser.parse_args() - return args - - - -# throw HTTP request - -def search(conference, year, *config): - # - # conference: str or list - # - # for natural language processing conferences: - # ('acl', 'anlp', 'cl', 'conll', 'eacl', - # 'emnlp', 'naacl', 'semeval', 'tacl', - # 'ws', 'alta', 'coling', 'hlt', - # 'ijcnlp', 'jep-taln-recital', 'lrec', - # 'muc', 'paclic', 'ranlp', - # 'rocling-ijclclp', 'tinlap', 'tipster') - # - # for machine learning conferences: - # ('nips', 'icml', 'iclr', 'ijcnn', 'ijcai') - # - # for computer vision conferences: - # ('cvpr', 'iccv') - # - # year: str or int or list (1965 or greater) - # - # *config: argparse.Namespace object (optional) - # - # - # collections.OrderedDict {:} - - conferences = { 'NLP' : ['acl', 'anlp', 'cl', 'conll', 'eacl', 'emnlp', 'naacl',\ - 'semeval', 'tacl', 'ws', 'alta', 'coling', 'hlt',\ - 'ijcnlp', 'jep-taln-recital', 'lrec', 'muc', 'paclic', 'ranlp',\ - 'rocling-ijclclp', 'tinlap', 'tipster'],\ - 'ML' : ['nips', 'icml', 'iclr', 'ijcnn', 'ijcai'],\ - 'CV' : ['cvpr', 'iccv']} - - sources = {} - - for conf in conferences['NLP']: - sources[conf] = 'aclweb' - for conf in conferences['ML']: - sources[conf] = 'dblp' - for conf in conferences['CV']: - sources[conf] = 'dblp' - - url_container = { 'aclweb' : 'https://aclweb.org/anthology/events/{0}-{1}',\ - 'dblp' : 'https://dblp.org/db/conf/{0}/{0}{1}.html'} - - class Query(): - def __init__(self, conf, yr): - self.conference = conf - self.year = yr - self.config = None - self.res = None - self.url = None - self.source = None - self.config = None - - queries = [] - - if type(conference) is not list: - conference = [conference] - if type(year) is not list: - year = [year] - - for c in conference: - for y in year: - query = Query(c.lower(), str(y)) - query.config = config[0] - - # check conference name - try: - query.source = sources[query.conference] - query.url = url_container[query.source].format(query.conference, query.year) - queries.append(query) - except KeyError: - seps = '=' * 35 - print("Error: unavailable conference '{}'.".format(query.conference)) - print(seps) - print('Available conferences:') - print('\tML, AI:\n\t\t{}'.format(', '.join(conferences['ML']))) - print('\tCV:\n\t\t{}'.format(', '.join(conferences['CV']))) - print('\tNLP:\n\t\t{}'.format(', '.join(conferences['NLP']))) - print(seps) - - - # make connections - for q in queries: - print('Connecting for {} {} ...'.format(q.conference.upper(), q.year)) - try: - with urllib.request.urlopen(q.url) as res: - parse(res, q) - except urllib.error.HTTPError as err: - print('Error: {} {}'.format(err.code, err.reason)) - except urllib.error.URLError as err: - print('Error: {}'.format(err.reason)) - - - -class Article(): - def __init__(self, title='', author=[], abstract='', conference='', year=0, url=''): - self.title = title - self.author = author - self.abstract = abstract - self.conference = conference - self.year = year - self.url = url - - - -# process received HTTP response - -def parse(res, query): - selector = {'aclweb' : 'a[class="align-middle"]',\ - 'dblp' : 'span[class="title"]'} - - url_getter = {'aclweb' : lambda tag: 'https://aclweb.org' + tag.attrs['href'] if tag.attrs['href'].startswith('/anthology/paper') else None,\ - 'dblp' : lambda tag: tag.parent.parent.contents[2].ul.li.div.a['href']} - - prev_title = '' - n_total = 0 - articles = [] - - # get html content - html = res.read() - soup = bs4.BeautifulSoup(html, 'html5lib') - - # extract articles - for tag in soup.select(selector[query.source]): - skip = False - title = tag.getText() - if title != prev_title: - n_total += 1 - prev_title = title - if query.config.all or title_is_medical(title): - url = url_getter[query.source](tag) - if url is None: - continue - else: - article = Article(title=title, url=url, conference=query.conference, year=query.year) - articles.append(article) - - if not query.config.quiet: - sys.stdout.write('\rSearching... {} match / {}'.format(len(articles), n_total)) - sys.stdout.flush() - - - # prepare output display - output = '' - - if articles: - if query.config.markdown: - if query.config.url_only: - output = '\n'.join([ '[{0}]({0})'.format(article.url) for article in articles ]) - else: - output = '\n'.join([ '[{0}]({1})'.format(article.title, article.url) for article in articles ]) - elif query.config.html: - if query.config.url_only: - output = '
\n'.join([ '
{0}'.format(article.url) for article in articles ]) - else: - output = '
\n'.join([ '{0}'.format(article.title.replace('"', "'"), article.url) for article in articles ]) - else: - if query.config.title_only: - output = '\n'.join([ article.title for article in articles ]) - elif query.config.url_only: - output = '\n'.join([ article.url for article in articles ]) - else: - output = '\n\n'.join([ '{0}\n{1}'.format(article.title, article.url) for article in articles ]) - else: - output = 'No medical-like AI papers found.' - - - # display output - - seps = '=' * 35 - - if query.config.quiet: - if articles: - if not query.config.all: - print('Medical-like AI papers in {} {}: {} / {}'.format(query.conference.upper(), query.year, len(articles), n_total)) - else: - print('All papers in {} {}: {}'.format(query.conference.upper(), query.year, len(articles))) - else: - print(output) - else: - sys.stdout.write('\n') - if articles: - print(seps) - print(output) - print(seps) - if not query.config.all: - print('Medical-like AI papers in {} {}: {} / {}'.format(query.conference.upper(), query.year, len(articles), n_total)) - else: - print('All papers in {} {}: {}'.format(query.conference.upper(), query.year, len(articles))) - print(seps) - else: - print(output) - - - # copy onto clipboard if needed - if query.config.copy: - pyperclip.copy(output) - print(' * * * Copied this result to clipboard * * *') - - # return OrderedDict - return articles - - -if __name__ == '__main__': - config = get_args() - config.conferences = [] - config.years = [] - - for value in config.conferences_and_years: - try: - value = int(value) - config.years.append(value) - except ValueError: - config.conferences.append(value) - - search(config.conferences, config.years, config)