diff --git a/fetch_apply.py b/fetch_apply.py
index 28b5986..55b48b9 100644
--- a/fetch_apply.py
+++ b/fetch_apply.py
@@ -3,20 +3,25 @@
 import os
 import sys
 import time
+import argparse
+
+parser = argparse.ArgumentParser(description='fetch patent applications from USPTO bulk data')
+parser.add_argument('--files', type=str, default='meta/apply_files.txt', help='list of application files to fetch')
+parser.add_argument('--output', type=str, default='data/apply', help='directory to store fetched files')
+parser.add_argument('--delay', type=int, default=10, help='number of seconds to wait between files')
+parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
+args = parser.parse_args()

-apply_dir = 'data/apply'
-apply_fpath = 'meta/apply_files.txt'
 apply_url_fmt = 'https://bulkdata.uspto.gov/data/patent/application/redbook/bibliographic/{}/{}'
-overwrite = False

-if not os.path.exists(apply_dir):
-    os.makedirs(apply_dir)
+if not os.path.exists(args.output):
+    os.makedirs(args.output)

 url_list = []
-for line in open(apply_fpath):
+for line in open(args.files):
     line = line.strip()
-    path = os.path.join(apply_dir, line)
-    if not overwrite and os.path.isfile(path):
+    path = os.path.join(args.output, line)
+    if not args.overwrite and os.path.isfile(path):
         continue

     if line.startswith('ipab'):
@@ -34,7 +39,7 @@
     print(f'Fetching {name}')
     os.system(f'curl -o {path} {url}')
     print()
-    time.sleep(10)
+    time.sleep(args.delay)

 # to extract:
 # cd data/apply
diff --git a/fetch_assign.py b/fetch_assign.py
index edb2d3c..6223cb9 100644
--- a/fetch_assign.py
+++ b/fetch_assign.py
@@ -2,20 +2,25 @@
 import os
 import time
+import argparse
+
+parser = argparse.ArgumentParser(description='fetch patent assignments from USPTO bulk data')
+parser.add_argument('--files', type=str, default='meta/assign_files.txt', help='list of assignment files to fetch')
+parser.add_argument('--output', type=str, default='data/assign', help='directory to store fetched files')
+parser.add_argument('--delay', type=int, default=1, help='number of seconds to wait between files')
+parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
+args = parser.parse_args()

-assign_dir = 'data/assign'
-assign_fname = 'meta/assign_files.txt'
 assign_url_fmt = 'https://bulkdata.uspto.gov/data/patent/assignment/{}'
-overwrite = False

-if not os.path.exists(assign_dir):
-    os.mkdir(assign_dir)
+if not os.path.exists(args.output):
+    os.mkdir(args.output)

 url_list = []
-for line in open(assign_fname):
+for line in open(args.files):
     line = line.strip()
-    path = os.path.join(assign_dir, line)
-    if not overwrite and os.path.isfile(path):
+    path = os.path.join(args.output, line)
+    if not args.overwrite and os.path.isfile(path):
         continue

     year = int(line[2:6])
@@ -27,7 +32,7 @@
     print(f'Fetching {name}')
     os.system(f'curl -o {path} {url}')
     print()
-    time.sleep(1)
+    time.sleep(args.delay)

 # extract files
 # cd data/assign
diff --git a/fetch_grant.py b/fetch_grant.py
index 938bfe1..d276ae5 100644
--- a/fetch_grant.py
+++ b/fetch_grant.py
@@ -2,20 +2,25 @@
 import os
 import time
+import argparse
+
+parser = argparse.ArgumentParser(description='fetch patent grants from USPTO bulk data')
+parser.add_argument('--files', type=str, default='meta/grant_files.txt', help='list of grant files to fetch')
+parser.add_argument('--output', type=str, default='data/grant', help='directory to store fetched files')
+parser.add_argument('--delay', type=int, default=10, help='number of seconds to wait between files')
+parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
+args = parser.parse_args()

-grant_dir = 'data/grant'
-grant_fpath = 'meta/grant_files.txt'
 grant_url_fmt = 'https://bulkdata.uspto.gov/data/patent/grant/redbook/bibliographic/{}/{}'
-overwrite = False

-if not os.path.exists(grant_dir):
-    os.makedirs(grant_dir)
+if not os.path.exists(args.output):
+    os.makedirs(args.output)

 url_list = []
-for line in open(grant_fpath):
+for line in open(args.files):
     line = line.strip()
-    path = os.path.join(grant_dir, line)
-    if not overwrite and os.path.isfile(path):
+    path = os.path.join(args.output, line)
+    if not args.overwrite and os.path.isfile(path):
         continue

     if line.startswith('ipgb'):
@@ -32,7 +37,7 @@
     print(f'Fetching {name}')
     os.system(f'curl -o {path} {url}')
     print()
-    time.sleep(10)
+    time.sleep(args.delay)

 # to extract:
 # cd data/grant
diff --git a/parse_apply.py b/parse_apply.py
index f454146..5fa7c9c 100644
--- a/parse_apply.py
+++ b/parse_apply.py
@@ -3,13 +3,8 @@
 import re
 import os
-import sys
 import glob
-import argparse
-import sqlite3
-from lxml.etree import XMLPullParser
 from collections import defaultdict
-from itertools import chain
 from traceback import print_exc

 from parse_tools import *
@@ -64,7 +59,7 @@ def parse_apply_gen2(elem, fname):
         pat['abstract'] = raw_text(abst, sep=' ')

     # roll it in
-    return store_patent(pat)
+    return pat

 def parse_apply_gen3(elem, fname):
     pat = defaultdict(str)
@@ -116,86 +111,59 @@ def parse_apply_gen3(elem, fname):
         pat['abstract'] = raw_text(abspar, sep=' ')

     # roll it in
-    return store_patent(pat)
-
-# parse input arguments
-parser = argparse.ArgumentParser(description='patent application parser')
-parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
-parser.add_argument('--db', type=str, default=None, help='database file to store to')
-parser.add_argument('--output', type=int, default=None, help='how often to output summary')
-parser.add_argument('--limit', type=int, default=None, help='only parse n patents')
-args = parser.parse_args()
+    return pat

 # table schema
-schema = {
-    'appnum': 'text', # Patent number
-    'appdate': 'text', # Application date
-    'appname': 'text', # Assignee name
-    'pubnum': 'text', # Publication number
-    'pubdate': 'text', # Publication date
-    'ipc': 'text', # Main IPC code
-    'ipcver': 'text', # IPC version info
-    'city': 'text', # Assignee city
-    'state': 'text', # State code
-    'country': 'text', # Assignee country
-    'title': 'text', # Title
-    'abstract': 'text', # Abstract
+schema_apply = {
+    'appnum': 'str', # Patent number
+    'appdate': 'str', # Application date
+    'appname': 'str', # Assignee name
+    'pubnum': 'str', # Publication number
+    'pubdate': 'str', # Publication date
+    'ipc': 'str', # Main IPC code
+    'ipcver': 'str', # IPC version info
+    'city': 'str', # Assignee city
+    'state': 'str', # State code
+    'country': 'str', # Assignee country
+    'title': 'str', # Title
+    'abstract': 'str', # Abstract
     'gen': 'int', # USPTO data format
-    'file': 'text', # path to source file
+    'file': 'str', # path to source file
 }
-tabsig = ', '.join([f'{k} {v}' for k, v in schema.items()])
-
-# database setup
-if args.db is not None:
-    con = sqlite3.connect(args.db)
-    cur = con.cursor()
-    cur.execute(f'CREATE TABLE IF NOT EXISTS apply ({tabsig})')
-    cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS apply_appnum ON apply (appnum)')
-    cur.execute('CREATE TABLE IF NOT EXISTS ipc_apply (appnum text, ipc text, rank int, ver text)')
-    pat_chunker = ChunkInserter(con, table='apply')
-    ipc_chunker = ChunkInserter(con, table='ipc_apply')
-else:
-    pat_chunker = DummyInserter()
-    ipc_chunker = DummyInserter()
-# chunking express
-i = 0
-def store_patent(pat):
-    global i
-    i += 1
+schema_ipc = {
+    'patnum': 'str', # Patent number
+    'ipc': 'str', # IPC code
+    'rank': 'int', # Order listed
+    'version': 'str' # IPC version
+}
+# chunking express
+def store_patent(pat, chunker_pat, chunker_ipc):
     an, iv = pat['appnum'], pat['ipcver']

     # store ipcs
     for j, ipc in enumerate(pat['ipcs']):
         if j == 0: pat['ipc'] = ipc
-        ipc_chunker.insert(an, ipc, j, iv)
+        chunker_ipc.insert(an, ipc, j, iv)

     # store patent
-    pat_chunker.insert(*(pat[k] for k in schema))
-
-    # output
-    if args.output is not None and i % args.output == 0:
-        print('an = {appnum:10.10s}, fd = {appdate:10.10s}, ti = {title:30.30s}, on = {appname:30.30s}, ci = {city:15.15s}, st = {state:2s}, ct = {country:2s}'.format(**{k: pat.get(k, '') for k in schema}))
+    chunker_pat.insert(*(pat.get(k, '') for k in schema_apply))

-    # limit
-    if args.limit is not None and i >= args.limit:
-        print("Reached limit.")
-        return False
-    else:
-        return True
-
-# collect files
-if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
-    targ_dir = 'data/apply' if len(args.target) == 0 else args.target[0]
-    file_list = sorted(glob.glob(f'{targ_dir}/pab*.xml')) + sorted(glob.glob(f'{targ_dir}/ipab*.xml'))
-else:
-    file_list = args.target
-
-# parse by generation
-for fpath in file_list:
-    # detect generation
+# file level
+def parse_file(fpath, output, overwrite=False, dryrun=False, display=0):
     fdir, fname = os.path.split(fpath)
+    ftag, fext = os.path.splitext(fname)
+
+    opath = os.path.join(output, ftag)
+    opath_apply = f'{opath}_apply.csv'
+    opath_ipc = f'{opath}_ipc.csv'
+
+    if not overwrite:
+        if os.path.exists(opath_apply) and os.path.exists(opath_ipc):
+            print(f'{ftag}: Skipping')
+            return
+
     if fname.startswith('pab'):
         gen = 2
         main_tag = 'patent-application-publication'
@@ -204,22 +172,72 @@ def store_patent(pat):
         gen = 3
         parser = lambda fp: parse_wrapper(fp, 'us-patent-application', parse_apply_gen3)
     else:
-        raise Exception('Unknown format')
+        raise Exception(f'{ftag}: Unknown format')
+
+    if not dryrun:
+        chunker_apply = ChunkWriter(opath_apply, schema=schema_apply)
+        chunker_ipc = ChunkWriter(opath_ipc, schema=schema_ipc)
+    else:
+        chunker_apply = DummyWriter()
+        chunker_ipc = DummyWriter()

     # parse it up
-    print(f'Parsing {fname}, gen {gen}')
-    i0 = i
     try:
-        parser(fpath)
+        print(f'{ftag}: Starting')
+
+        i = 0
+        for pat in parser(fpath):
+            i += 1
+
+            store_patent(pat, chunker_apply, chunker_ipc)
+
+            # output
+            if display > 0 and i % display == 0:
+                spat = {k: pat.get(k, '') for k in schema_apply}
+                print('an = {appnum:10.10s}, fd = {appdate:10.10s}, ti = {title:30.30s}, on = {appname:30.30s}, ci = {city:15.15s}, st = {state:2s}, ct = {country:2s}'.format(**spat))
+
+        # commit to db and close
+        chunker_apply.commit()
+        chunker_ipc.commit()
+
+        print(f'{ftag}: Parsed {i} patents')
     except Exception as e:
-        print('EXCEPTION OCCURRED!')
+        print(f'{ftag}: EXCEPTION OCCURRED!')
         print_exc()
-    print(f'Found {i-i0} patents, {i} total')
-    print()
-# commit to db and close
-pat_chunker.commit()
-ipc_chunker.commit()
+        chunker_apply.delete()
+        chunker_ipc.delete()
+
+if __name__ == '__main__':
+    import argparse
+    from multiprocessing import Pool
+
+    # parse input arguments
+    parser = argparse.ArgumentParser(description='patent application parser')
+    parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
+    parser.add_argument('--output', type=str, default='parsed/apply', help='directory to output to')
+    parser.add_argument('--display', type=int, default=1000, help='how often to display summary')
+    parser.add_argument('--dryrun', action='store_true', help='do not actually store')
+    parser.add_argument('--overwrite', action='store_true', help='clobber existing files')
+    parser.add_argument('--cores', type=int, default=10, help='number of cores to use')
+    args = parser.parse_args()
+
+    # collect files
+    if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
+        targ_dir = 'data/apply' if len(args.target) == 0 else args.target[0]
+        file_list = sorted(glob.glob(f'{targ_dir}/pab*.xml')) + sorted(glob.glob(f'{targ_dir}/ipab*.xml'))
+    else:
+        file_list = args.target
+
+    # ensure output dir
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    # apply options
+    opts = dict(overwrite=args.overwrite, dryrun=args.dryrun, display=args.display)
+    def parse_file_opts(fpath):
+        parse_file(fpath, args.output, **opts)

-if args.db is not None:
-    con.close()
+    # parse files
+    with Pool(args.cores) as pool:
+        pool.map(parse_file_opts, file_list, chunksize=1)
diff --git a/parse_assign.py b/parse_assign.py
index bb0b0cd..6ea174b 100644
--- a/parse_assign.py
+++ b/parse_assign.py
@@ -1,13 +1,13 @@
+#!/usr/bin/env python3
+# coding: UTF-8
+
 import re
 import os
-import sys
 import glob
-import sqlite3
-import argparse
+from collections import defaultdict
+from traceback import print_exc
 from lxml.etree import iterparse
 from parse_tools import *
-from traceback import print_exc
-from collections import defaultdict

 # parse assignment
 def parse_assign_gen3(elem, fname):
@@ -40,104 +40,117 @@
     # patent info
     pat['patnums'] = [prune_patnum(pn) for pn in gen3_assign(patents)]

-    return store_patent(pat)
+    return pat

 # parse file
 def parse_file_gen3(fpath):
     _, fname = os.path.split(fpath)
     for event, elem in iterparse(fpath, tag='patent-assignment', events=['end'], recover=True):
-        if not parse_assign_gen3(elem, fname):
-            return False
+        yield parse_assign_gen3(elem, fname)
         clear(elem)
-    return True
-
-# parse input arguments
-parser = argparse.ArgumentParser(description='USPTO patent assignment parser.')
-parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
-parser.add_argument('--db', type=str, default=None, help='database file to store to')
-parser.add_argument('--output', type=int, default=None, help='how often to output summary')
-parser.add_argument('--limit', type=int, default=None, help='only parse n patents')
-args = parser.parse_args()

 # table schema
-schema = {
-    'assignid': 'integer primary key', # unique assignment id
-    'patnum': 'text', # Patent number
-    'execdate': 'text', # Execution date
-    'recdate': 'text', # Reception date
-    'conveyance': 'text', # Conveyance description
-    'assignor': 'text', # Assignor name
-    'assignee': 'text', # Assignee name
-    'assignee_state': 'text', # State code
-    'assignee_country': 'text', # Assignee country
+schema_assign = {
+    'assignid': 'int', # unique assignment id
+    'patnum': 'str', # Patent number
+    'execdate': 'str', # Execution date
+    'recdate': 'str', # Reception date
+    'conveyance': 'str', # Conveyance description
+    'assignor': 'str', # Assignor name
+    'assignee': 'str', # Assignee name
+    'assignee_state': 'str', # State code
+    'assignee_country': 'str', # Assignee country
     'gen': 'int', # USPTO data format
-    'file': 'text', # path to source file
+    'file': 'str', # path to source file
 }
-tabsig = ', '.join([f'{k} {v}' for k, v in schema.items()])
-
-# connect to patent db
-if args.db is not None:
-    con = sqlite3.connect(args.db)
-    cur = con.cursor()
-    cur.execute(f'CREATE TABLE IF NOT EXISTS assign ({tabsig})')
-    chunker = ChunkInserter(con, table='assign')
-else:
-    chunker = DummyInserter()

 # chunking express
-i, p = 0, 0
-def store_patent(pat):
-    global i, p
-    i += 1
-
+def store_patent(pat, chunker_assign):
     # filter out individuals and non-transfers
     pat['assignor_type'] = org_type(pat['assignor'])
     pat['assignee_type'] = org_type(pat['assignee'])
     pat['convey_type'] = convey_type(pat['conveyance'])
     if pat['assignor_type'] == ORG_INDV or pat['assignee_type'] == ORG_INDV or pat['convey_type'] == CONV_OTHER:
-        return True
-    p += 1
+        return

     # store assign
     for pn in pat['patnums']:
         pat['patnum'] = pn
-        chunker.insert(*(pat[k] for k in schema))
+        chunker_assign.insert(*(pat[k] for k in schema_assign))

-    # logging
-    if args.output is not None and p % args.output == 0:
-        pat['npat'] = len(pat['patnums'])
-        print('[{npat:4d}]: {assignor:40.40s} [{assignor_type:1d}] -> {assignee:30.30s} [{assignee_type:1d}] ({recdate:8.8s}, {assignee_country:20.20s})'.format(**pat))
+# file level
+def parse_file(fpath, output, overwrite=False, dryrun=False, display=0):
+    fdir, fname = os.path.split(fpath)
+    ftag, fext = os.path.splitext(fname)

-    # break
-    if args.limit is not None and p >= args.limit:
-        return False
+    opath = os.path.join(output, ftag)
+    opath_assign = f'{opath}_assign.csv'

-    return True
+    if not overwrite:
+        if os.path.exists(opath_assign):
+            print(f'{ftag}: Skipping')
+            return

-# collect files
-if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
-    targ_dir = 'data/assign' if len(args.target) == 0 else args.target[0]
-    file_list = sorted(glob.glob(f'{targ_dir}/*.xml'))
-else:
-    file_list = args.target
-
-# do robust parsing
-for fpath in file_list:
-    print(f'Parsing {fpath}')
-    i0, p0 = i, p
+    if not dryrun:
+        chunker_assign = ChunkWriter(opath_assign, schema=schema_assign)
+    else:
+        chunker_assign = DummyWriter()

+    # parse it up
     try:
-        parse_file_gen3(fpath)
-    except Exception as e:
-        print('EXCEPTION OCCURRED!')
-        print_exc()
+        print(f'{ftag}: Starting')
+
+        i = 0
+        for pat in parse_file_gen3(fpath):
+            i += 1

-    print(f'Found {i-i0} records, {i} total')
-    print(f'Found {p-p0} transfers, {p} total')
-    print()
+            store_patent(pat, chunker_assign)

-# clear out the rest
-chunker.commit()
+            # output
+            if display > 0 and i % display == 0:
+                pat['npat'] = len(pat['patnums'])
+                print('[{npat:4d}]: {assignor:40.40s} [{assignor_type:1d}] -> {assignee:30.30s} [{assignee_type:1d}] ({recdate:8.8s}, {assignee_country:20.20s})'.format(**pat))
+
+        print(f'{ftag}: Parsed {i} records')
+
+        # clear out the rest
+        chunker_assign.commit()
+    except Exception as e:
+        print(f'{ftag}: EXCEPTION OCCURRED!')
+        print_exc()

-if args.db is not None:
-    con.close()
+        chunker_assign.delete()
+
+if __name__ == '__main__':
+    import argparse
+    from multiprocessing import Pool
+
+    # parse input arguments
+    parser = argparse.ArgumentParser(description='patent application parser')
+    parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
+    parser.add_argument('--output', type=str, default='parsed/assign', help='directory to output to')
+    parser.add_argument('--display', type=int, default=1000, help='how often to display summary')
+    parser.add_argument('--dryrun', action='store_true', help='do not actually store')
+    parser.add_argument('--overwrite', action='store_true', help='clobber existing files')
+    parser.add_argument('--cores', type=int, default=10, help='number of cores to use')
+    args = parser.parse_args()
+
+    # collect files
+    if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
+        targ_dir = 'data/assign' if len(args.target) == 0 else args.target[0]
+        file_list = sorted(glob.glob(f'{targ_dir}/*.xml'))
+    else:
+        file_list = args.target
+
+    # ensure output dir
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    # apply options
+    opts = dict(overwrite=args.overwrite, dryrun=args.dryrun, display=args.display)
+    def parse_file_opts(fpath):
+        parse_file(fpath, args.output, **opts)
+
+    # parse files
+    with Pool(args.cores) as pool:
+        pool.map(parse_file_opts, file_list, chunksize=1)
diff --git a/parse_grant.py b/parse_grant.py
index 018e25e..adb331b 100644
--- a/parse_grant.py
+++ b/parse_grant.py
@@ -3,13 +3,10 @@
 import re
 import os
-import sys
 import glob
-import argparse
-import sqlite3
 from collections import defaultdict
-from itertools import chain
 from traceback import print_exc
+from itertools import chain

 from parse_tools import *

 # parse it up
@@ -33,12 +30,11 @@
         if tag == 'PATN':
             if pat is not None:
                 pat['appnum'] = src + apn
-                if not store_patent(pat):
-                    break
+                yield pat
             pat = defaultdict(str)
             sec = 'PATN'
             pat['gen'] = 1
-            _, pat['file'] = os.path.split(fpath)
+            _, pat['file'] = os.path.split(fname)
             pat['ipcs'] = []
             pat['cites'] = []
             src, apn = '', ''
@@ -150,7 +146,7 @@
         pat['abstract'] = '\n'.join([raw_text(e) for e in abspars])

     # roll it in
-    return store_patent(pat)
+    return pat

 def parse_grant_gen3(elem, fname):
     pat = defaultdict(str)
@@ -217,94 +213,80 @@
         pat['abstract'] = raw_text(abspar, sep=' ')

     # roll it in
-    return store_patent(pat)
-
-# parse input arguments
-parser = argparse.ArgumentParser(description='patent grant parser.')
-parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
-parser.add_argument('--db', type=str, default=None, help='database file to store to')
-parser.add_argument('--output', type=int, default=None, help='how often to output summary')
-parser.add_argument('--limit', type=int, default=None, help='only parse n patents')
-args = parser.parse_args()
-
-# table schema
-schema = {
-    'patnum': 'text', # Patent number
-    'pubdate': 'text', # Publication date
-    'appnum': 'text', # Application number
-    'appdate': 'text', # Publication date
-    'ipc': 'text', # Main IPC code
-    'ipcver': 'text', # IPC version info
-    'city': 'text', # Assignee city
-    'state': 'text', # State code
-    'country': 'text', # Assignee country
-    'owner': 'text', # Assignee name
+    return pat
+
+# table schemas
+schema_grant = {
+    'patnum': 'str', # Patent number
+    'pubdate': 'str', # Publication date
+    'appnum': 'str', # Application number
+    'appdate': 'str', # Publication date
+    'ipc': 'str', # Main IPC code
+    'ipcver': 'str', # IPC version info
+    'city': 'str', # Assignee city
+    'state': 'str', # State code
+    'country': 'str', # Assignee country
+    'owner': 'str', # Assignee name
     'claims': 'int', # Independent claim
-    'title': 'text', # Title
-    'abstract': 'text', # Abstract
+    'title': 'str', # Title
+    'abstract': 'str', # Abstract
     'gen': 'int', # USPTO data format
-    'file': 'text', # path to source file
+    'file': 'str', # path to source file
 }
-tabsig = ', '.join([f'{k} {v}' for k, v in schema.items()])
-
-# database setup
-if args.db is not None:
-    con = sqlite3.connect(args.db)
-    cur = con.cursor()
-    cur.execute(f'CREATE TABLE IF NOT EXISTS grant ({tabsig})')
-    cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS grant_patnum ON grant (patnum)')
-    cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS grant_appnum ON grant (appnum)')
-    cur.execute('CREATE TABLE IF NOT EXISTS ipc_grant (patnum text, ipc text, rank int, ver text)')
-    cur.execute('CREATE TABLE IF NOT EXISTS cite (src text, dst text)')
-    pat_chunker = ChunkInserter(con, table='grant')
-    ipc_chunker = ChunkInserter(con, table='ipc_grant')
-    cit_chunker = ChunkInserter(con, table='cite')
-else:
-    pat_chunker = DummyInserter()
-    ipc_chunker = DummyInserter()
-    cit_chunker = DummyInserter()
-
-# global adder
-i = 0
-def store_patent(pat):
-    global i
-    i += 1
+schema_ipc = {
+    'patnum': 'str', # Patent number
+    'ipc': 'str', # IPC code
+    'rank': 'int', # Order listed
+    'version': 'str' # IPC version
+}
+
+schema_cite = {
+    'src': 'str', # Source patent (citer)
+    'dst': 'str' # Destination patent (citee)
+}
+
+# patent adder
+def store_patent(pat, chunker_grant, chunker_ipc, chunker_cite):
     pn, iv = pat['patnum'], pat['ipcver']

     # store cites
     for cite in pat['cites']:
-        cit_chunker.insert(pn, cite)
+        chunker_cite.insert(pn, cite)

     # store ipcs
     for j, ipc in enumerate(pat['ipcs']):
-        if j == 0: pat['ipc'] = ipc
-        ipc_chunker.insert(pn, ipc, j, iv)
+        if j == 0:
+            pat['ipc'] = ipc
+        chunker_ipc.insert(pn, ipc, j, iv)

     # store patent
-    pat_chunker.insert(*(pat.get(k, None) for k in schema))
-
-    # output
-    if args.output is not None and i % args.output == 0:
-        print('pn = {patnum:10.10s}, pd = {pubdate:10.10s}, ti = {title:30.30s}, on = {owner:30.30s}, ci = {city:15.15s}, st = {state:2s}, ct = {country:2s}'.format(**{k: pat.get(k, '') for k in schema}))
+    chunker_grant.insert(*(pat.get(k, '') for k in schema_grant))

-    # limit
-    if args.limit is not None and i >= args.limit:
-        print("Reached limit.")
-        return False
+# file level
+def parse_file(fpath, output, overwrite=False, dryrun=False, display=0):
+    fdir, fname = os.path.split(fpath)
+    ftag, fext = os.path.splitext(fname)
+
+    opath = os.path.join(output, ftag)
+    opath_grant = f'{opath}_grant.csv'
+    opath_ipc = f'{opath}_ipc.csv'
+    opath_cite = f'{opath}_cite.csv'
+
+    if not overwrite:
+        if os.path.exists(opath_grant) and os.path.exists(opath_ipc) and os.path.exists(opath_cite):
+            print(f'{ftag}: Skipping')
+            return
+
+    if not dryrun:
+        chunker_grant = ChunkWriter(opath_grant, schema=schema_grant)
+        chunker_ipc = ChunkWriter(opath_ipc, schema=schema_ipc)
+        chunker_cite = ChunkWriter(opath_cite, schema=schema_cite)
     else:
-        return True
-
-# collect files
-if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
-    targ_dir = 'data/grant' if len(args.target) == 0 else args.target[0]
-    file_list = sorted(glob.glob(f'{targ_dir}/*.dat')) + sorted(glob.glob(f'{targ_dir}/pgb*.xml')) + sorted(glob.glob(f'{targ_dir}/ipgb*.xml'))
-else:
-    file_list = args.target
+        chunker_grant = DummyWriter()
+        chunker_ipc = DummyWriter()
+        chunker_cite = DummyWriter()

-# parse by generation
-for fpath in file_list:
-    fdir, fname = os.path.split(fpath)
     if fname.endswith('.dat'):
         gen = 1
         parser = parse_grant_gen1
@@ -315,22 +297,68 @@ def store_patent(pat):
         gen = 3
         parser = lambda fp: parse_wrapper(fp, 'us-patent-grant', parse_grant_gen3)
     else:
-        print(f'Unknown format: {fname}')
-    print(f'Parsing {fname}, gen = {gen}')
-    i0 = i
+    # parse it up
     try:
-        parser(fpath)
+        print(f'{ftag}: Starting')
+
+        i = 0
+        for pat in parser(fpath):
+            i += 1
+
+            # store all info
+            store_patent(pat, chunker_grant, chunker_ipc, chunker_cite)
+
+            # output if needed
+            if display > 0 and i % display == 0:
+                spat = {k: pat.get(k, '') for k in schema_grant}
+                print('pn = {patnum:10.10s}, pd = {pubdate:10.10s}, ti = {title:30.30s}, on = {owner:30.30s}, ci = {city:15.15s}, st = {state:2s}, ct = {country:2s}'.format(**spat))
+
+        # commit remaining
+        chunker_grant.commit()
+        chunker_ipc.commit()
+        chunker_cite.commit()
+
+        print(f'{ftag}: Parsed {i} patents')
     except Exception as e:
-        print('EXCEPTION OCCURRED!')
+        print(f'{ftag}: EXCEPTION OCCURRED!')
         print_exc()
-    print(f'Found {i-i0} patents, {i} total')
-    print()
-# commit to db and close
-pat_chunker.commit()
-ipc_chunker.commit()
-cit_chunker.commit()
+        chunker_grant.delete()
+        chunker_ipc.delete()
+        chunker_cite.delete()
+
+if __name__ == '__main__':
+    import argparse
+    from multiprocessing import Pool
+
+    # parse input arguments
+    parser = argparse.ArgumentParser(description='patent grant parser.')
+    parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
+    parser.add_argument('--output', type=str, default='parsed/grant', help='directory to output to')
+    parser.add_argument('--display', type=int, default=1000, help='how often to display summary')
+    parser.add_argument('--dryrun', action='store_true', help='do not actually store')
+    parser.add_argument('--overwrite', action='store_true', help='clobber existing files')
+    parser.add_argument('--cores', type=int, default=10, help='number of cores to use')
+    args = parser.parse_args()
+
+    # collect files
+    if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
+        targ_dir = 'data/grant' if len(args.target) == 0 else args.target[0]
+        file_list = sorted(glob.glob(f'{targ_dir}/*.dat')) + sorted(glob.glob(f'{targ_dir}/pgb*.xml')) + sorted(glob.glob(f'{targ_dir}/ipgb*.xml'))
+    else:
+        file_list = args.target
+
+    # ensure output dir
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    # apply options
+    opts = dict(overwrite=args.overwrite, dryrun=args.dryrun, display=args.display)
+    def parse_file_opts(fpath):
+        parse_file(fpath, args.output, **opts)

-if args.db is not None:
-    con.close()
+    # parse files
+    with Pool(args.cores) as pool:
+        pool.map(parse_file_opts, file_list, chunksize=1)
diff --git a/parse_tools.py b/parse_tools.py
index 3da625f..bf37348 100644
--- a/parse_tools.py
+++ b/parse_tools.py
@@ -4,6 +4,8 @@
 import os
 import re
+import numpy as np
+import pandas as pd

 from lxml.etree import XMLPullParser

 ##
@@ -36,46 +38,52 @@ def parse_wrapper(fpath, main_tag, parser):
     pp = XMLPullParser(tag=main_tag, events=['end'], recover=True)
     def parse_all():
         for _, pat in pp.read_events():
-            if not parser(pat, fname):
-                return False
+            yield parser(pat, fname)
             clear(pat)
-        return True

     with open(fpath, errors='ignore') as f:
         pp.feed('<root>\n')
         for line in f:
             if line.startswith('<?xml'):
                 pass
             else:
                 pp.feed(line)
         else:
             pp.feed('</root>\n')
-    return parse_all()
+    yield from parse_all()

 ##
-## database interface
+## csv interface
 ##

+def astype(data, dtype):
+    if dtype == 'str':
+        return pd.Series(data, dtype='str')
+    elif dtype == 'int':
+        return pd.to_numeric(pd.Series(data), errors='coerce').astype('Int64')
+    else:
+        raise Exception(f'Unsupported type: {dtype}')
+
 # insert in chunks
-class ChunkInserter:
-    def __init__(self, con, table=None, cmd=None, cur=None, chunk_size=1000, nargs=None, output=False):
-        if table is None and cmd is None:
-            raise('Must specify either table or cmd')
-
-        self.con = con
-        self.cur = cur if cur is not None else con.cursor()
-        self.table = table
-        self.cmd = cmd
+class ChunkWriter:
+    def __init__(self, path, schema, chunk_size=1000, output=False):
+        self.path = path
+        self.schema = schema
         self.chunk_size = chunk_size
-        self.nargs = nargs
         self.output = output
         self.items = []
         self.i = 0
         self.j = 0

+        self.file = open(self.path, 'w+')
+        header = ','.join(schema)
+        self.file.write(f'{header}\n')
+
+    def __del__(self):
+        self.file.close()
+
     def insert(self, *args):
         self.items.append(args)
         if len(self.items) >= self.chunk_size:
@@ -95,28 +103,32 @@ def insertmany(self, args):
     def commit(self):
         self.i += 1
         self.j += len(self.items)
+
         if len(self.items) == 0:
             return
-        if self.cmd is None:
-            if self.nargs is None:
-                self.nargs = len(self.items[0])
-            sign = ','.join(self.nargs*'?')
-            self.cmd = f'insert or replace into {self.table} values ({sign})'
+
         if self.output:
             print(f'Committing chunk {self.i} to {self.table} ({len(self.items)})')
-        self.cur.executemany(self.cmd, self.items)
-        self.con.commit()
-        self.items = []
+
+        data = [x for x in zip(*self.items)]
+        frame = pd.DataFrame({k: astype(d, v) for (k, v), d in zip(self.schema.items(), data)})
+        frame.to_csv(self.file, index=False, header=False)
+
+        self.items.clear()
+
+    def delete(self):
+        self.file.close()
+        os.remove(self.path)

 # pretend to insert in chunks
-class DummyInserter:
+class DummyWriter:
     def __init__(self, *args, chunk_size=1000, output=False, **kwargs):
         self.chunk_size = chunk_size
         self.output = output
         self.last = None
         self.i = 0

-    def insert(self,*args):
+    def insert(self, *args):
         self.last = args
         self.i += 1
         if self.i >= self.chunk_size:
@@ -125,7 +137,7 @@ def insert(self,*args):
-    def insertmany(self,args):
+    def insertmany(self, args):
         if len(args) > 0:
             self.last = args[-1]
         self.i += len(args)
@@ -140,6 +152,9 @@ def commit(self):
             print(self.last)
         self.i = 0

+    def delete(self):
+        pass
+
 ##
 ## patnum parsers
 ##
@@ -152,7 +167,7 @@ def prune_patnum(pn, maxlen=7):
         patnum = pn
     else:
         prefix, patnum = ret.groups()
-        prefix = '' if (prefix is None or prefix is '0') else prefix
+        prefix = '' if (prefix is None or prefix == '0') else prefix
     patnum = patnum[:maxlen].lstrip('0')
     return prefix + patnum
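
The patch replaces the sqlite3-backed ChunkInserter with a per-file CSV ChunkWriter in parse_tools.py. As a quick orientation, here is a minimal sketch (not part of the patch) of how that ChunkWriter could be exercised on its own, assuming parse_tools.py is importable; the file name, schema, and rows below are purely illustrative.

    from parse_tools import ChunkWriter

    # hypothetical two-column schema; column types must be 'str' or 'int' (see astype)
    schema = {'patnum': 'str', 'rank': 'int'}

    writer = ChunkWriter('demo.csv', schema=schema, chunk_size=2)
    writer.insert('7654321', 0)  # rows are buffered in memory
    writer.insert('7654322', 1)  # reaching chunk_size flushes a pandas DataFrame to the CSV
    writer.commit()              # flush whatever remains in the buffer

Each parse_*.py script builds one ChunkWriter per output table and calls delete() on failure, so partially written CSVs are removed rather than mistaken for complete output on the next run.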