Commit

parallel parse
iamlemec committed Oct 21, 2019
1 parent 3e663be commit 5980194
Showing 7 changed files with 397 additions and 308 deletions.
23 changes: 14 additions & 9 deletions fetch_apply.py
@@ -3,20 +3,25 @@
 import os
 import sys
 import time
+import argparse
+
+parser = argparse.ArgumentParser(description='fetch patent applications from USPTO bulk data')
+parser.add_argument('--files', type=str, default='meta/apply_files.txt', help='list of application files to fetch')
+parser.add_argument('--output', type=str, default='data/apply', help='directory to store fetched files')
+parser.add_argument('--delay', type=int, default=10, help='number of seconds to wait between files')
+parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
+args = parser.parse_args()
 
-apply_dir = 'data/apply'
-apply_fpath = 'meta/apply_files.txt'
 apply_url_fmt = 'https://bulkdata.uspto.gov/data/patent/application/redbook/bibliographic/{}/{}'
-overwrite = False
 
-if not os.path.exists(apply_dir):
-    os.makedirs(apply_dir)
+if not os.path.exists(args.output):
+    os.makedirs(args.output)
 
 url_list = []
-for line in open(apply_fpath):
+for line in open(args.files):
     line = line.strip()
-    path = os.path.join(apply_dir, line)
-    if not overwrite and os.path.isfile(path):
+    path = os.path.join(args.output, line)
+    if not args.overwrite and os.path.isfile(path):
         continue
 
     if line.startswith('ipab'):
@@ -34,7 +39,7 @@
     print(f'Fetching {name}')
     os.system(f'curl -o {path} {url}')
     print()
-    time.sleep(10)
+    time.sleep(args.delay)
 
 # to extract:
 # cd data/apply
23 changes: 14 additions & 9 deletions fetch_assign.py
@@ -2,20 +2,25 @@
 
 import os
 import time
+import argparse
+
+parser = argparse.ArgumentParser(description='fetch patent assignments from USPTO bulk data')
+parser.add_argument('--files', type=str, default='meta/assign_files.txt', help='list of assignment files to fetch')
+parser.add_argument('--output', type=str, default='data/assign', help='directory to store fetched files')
+parser.add_argument('--delay', type=int, default=1, help='number of seconds to wait between files')
+parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
+args = parser.parse_args()
 
-assign_dir = 'data/assign'
-assign_fname = 'meta/assign_files.txt'
 assign_url_fmt = 'https://bulkdata.uspto.gov/data/patent/assignment/{}'
-overwrite = False
 
-if not os.path.exists(assign_dir):
-    os.mkdir(assign_dir)
+if not os.path.exists(args.output):
+    os.mkdir(args.output)
 
 url_list = []
-for line in open(assign_fname):
+for line in open(args.files):
     line = line.strip()
-    path = os.path.join(assign_dir, line)
-    if not overwrite and os.path.isfile(path):
+    path = os.path.join(args.output, line)
+    if not args.overwrite and os.path.isfile(path):
         continue
 
     year = int(line[2:6])
@@ -27,7 +32,7 @@
     print(f'Fetching {name}')
     os.system(f'curl -o {path} {url}')
     print()
-    time.sleep(1)
+    time.sleep(args.delay)
 
 # extract files
 # cd data/assign
23 changes: 14 additions & 9 deletions fetch_grant.py
@@ -2,20 +2,25 @@
 
 import os
 import time
+import argparse
+
+parser = argparse.ArgumentParser(description='fetch patent grants from USPTO bulk data')
+parser.add_argument('--files', type=str, default='meta/grant_files.txt', help='list of grant files to fetch')
+parser.add_argument('--output', type=str, default='data/grant', help='directory to store fetched files')
+parser.add_argument('--delay', type=int, default=10, help='number of seconds to wait between files')
+parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
+args = parser.parse_args()
 
-grant_dir = 'data/grant'
-grant_fpath = 'meta/grant_files.txt'
 grant_url_fmt = 'https://bulkdata.uspto.gov/data/patent/grant/redbook/bibliographic/{}/{}'
-overwrite = False
 
-if not os.path.exists(grant_dir):
-    os.makedirs(grant_dir)
+if not os.path.exists(args.output):
+    os.makedirs(args.output)
 
 url_list = []
-for line in open(grant_fpath):
+for line in open(args.files):
     line = line.strip()
-    path = os.path.join(grant_dir, line)
-    if not overwrite and os.path.isfile(path):
+    path = os.path.join(args.output, line)
+    if not args.overwrite and os.path.isfile(path):
         continue
 
     if line.startswith('ipgb'):
@@ -32,7 +37,7 @@
     print(f'Fetching {name}')
     os.system(f'curl -o {path} {url}')
     print()
-    time.sleep(10)
+    time.sleep(args.delay)
 
 # to extract:
 # cd data/grant
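
Usage sketch (not part of the commit itself): the three fetch scripts above now share one command-line interface, so going by the argparse blocks an invocation would presumably look like

    python fetch_apply.py --files meta/apply_files.txt --output data/apply --delay 10
    python fetch_assign.py --delay 1
    python fetch_grant.py --overwrite

where --overwrite re-downloads files that already exist locally instead of skipping them.
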
184 changes: 101 additions & 83 deletions parse_apply.py
@@ -3,13 +3,8 @@
 
 import re
 import os
-import sys
 import glob
-import argparse
-import sqlite3
-from lxml.etree import XMLPullParser
 from collections import defaultdict
-from itertools import chain
 from traceback import print_exc
 from parse_tools import *
 
@@ -64,7 +59,7 @@ def parse_apply_gen2(elem, fname):
     pat['abstract'] = raw_text(abst, sep=' ')
 
     # roll it in
-    return store_patent(pat)
+    return pat
 
 def parse_apply_gen3(elem, fname):
     pat = defaultdict(str)
@@ -116,86 +111,59 @@ def parse_apply_gen3(elem, fname):
     pat['abstract'] = raw_text(abspar, sep=' ')
 
     # roll it in
-    return store_patent(pat)
-
-# parse input arguments
-parser = argparse.ArgumentParser(description='patent application parser')
-parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
-parser.add_argument('--db', type=str, default=None, help='database file to store to')
-parser.add_argument('--output', type=int, default=None, help='how often to output summary')
-parser.add_argument('--limit', type=int, default=None, help='only parse n patents')
-args = parser.parse_args()
+    return pat
 
 # table schema
-schema = {
-    'appnum': 'text', # Patent number
-    'appdate': 'text', # Application date
-    'appname': 'text', # Assignee name
-    'pubnum': 'text', # Publication number
-    'pubdate': 'text', # Publication date
-    'ipc': 'text', # Main IPC code
-    'ipcver': 'text', # IPC version info
-    'city': 'text', # Assignee city
-    'state': 'text', # State code
-    'country': 'text', # Assignee country
-    'title': 'text', # Title
-    'abstract': 'text', # Abstract
+schema_apply = {
+    'appnum': 'str', # Patent number
+    'appdate': 'str', # Application date
+    'appname': 'str', # Assignee name
+    'pubnum': 'str', # Publication number
+    'pubdate': 'str', # Publication date
+    'ipc': 'str', # Main IPC code
+    'ipcver': 'str', # IPC version info
+    'city': 'str', # Assignee city
+    'state': 'str', # State code
+    'country': 'str', # Assignee country
+    'title': 'str', # Title
+    'abstract': 'str', # Abstract
     'gen': 'int', # USPTO data format
-    'file': 'text', # path to source file
+    'file': 'str', # path to source file
 }
-tabsig = ', '.join([f'{k} {v}' for k, v in schema.items()])
 
-# database setup
-if args.db is not None:
-    con = sqlite3.connect(args.db)
-    cur = con.cursor()
-    cur.execute(f'CREATE TABLE IF NOT EXISTS apply ({tabsig})')
-    cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS apply_appnum ON apply (appnum)')
-    cur.execute('CREATE TABLE IF NOT EXISTS ipc_apply (appnum text, ipc text, rank int, ver text)')
-    pat_chunker = ChunkInserter(con, table='apply')
-    ipc_chunker = ChunkInserter(con, table='ipc_apply')
-else:
-    pat_chunker = DummyInserter()
-    ipc_chunker = DummyInserter()
-
-# chunking express
-i = 0
-def store_patent(pat):
-    global i
-    i += 1
+schema_ipc = {
+    'patnum': 'str', # Patent number
+    'ipc': 'str', # IPC code
+    'rank': 'int', # Order listed
+    'version': 'str' # IPC version
+}
+
+# chunking express
+def store_patent(pat, chunker_pat, chunker_ipc):
     an, iv = pat['appnum'], pat['ipcver']
 
     # store ipcs
     for j, ipc in enumerate(pat['ipcs']):
         if j == 0: pat['ipc'] = ipc
-        ipc_chunker.insert(an, ipc, j, iv)
+        chunker_ipc.insert(an, ipc, j, iv)
 
     # store patent
-    pat_chunker.insert(*(pat[k] for k in schema))
-
-    # output
-    if args.output is not None and i % args.output == 0:
-        print('an = {appnum:10.10s}, fd = {appdate:10.10s}, ti = {title:30.30s}, on = {appname:30.30s}, ci = {city:15.15s}, st = {state:2s}, ct = {country:2s}'.format(**{k: pat.get(k, '') for k in schema}))
+    chunker_pat.insert(*(pat.get(k, '') for k in schema_apply))
 
-    # limit
-    if args.limit is not None and i >= args.limit:
-        print("Reached limit.")
-        return False
-    else:
-        return True
-
-# collect files
-if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
-    targ_dir = 'data/apply' if len(args.target) == 0 else args.target[0]
-    file_list = sorted(glob.glob(f'{targ_dir}/pab*.xml')) + sorted(glob.glob(f'{targ_dir}/ipab*.xml'))
-else:
-    file_list = args.target
-
-# parse by generation
-for fpath in file_list:
-    # detect generation
+# file level
+def parse_file(fpath, output, overwrite=False, dryrun=False, display=0):
     fdir, fname = os.path.split(fpath)
     ftag, fext = os.path.splitext(fname)
 
+    opath = os.path.join(output, ftag)
+    opath_apply = f'{opath}_apply.csv'
+    opath_ipc = f'{opath}_ipc.csv'
+
+    if not overwrite:
+        if os.path.exists(opath_apply) and os.path.exists(opath_ipc):
+            print(f'{ftag}: Skipping')
+            return
+
     if fname.startswith('pab'):
         gen = 2
         main_tag = 'patent-application-publication'
@@ -204,22 +172,72 @@ def store_patent(pat):
         gen = 3
         parser = lambda fp: parse_wrapper(fp, 'us-patent-application', parse_apply_gen3)
     else:
-        raise Exception('Unknown format')
+        raise Exception(f'{ftag}: Unknown format')
 
+    if not dryrun:
+        chunker_apply = ChunkWriter(opath_apply, schema=schema_apply)
+        chunker_ipc = ChunkWriter(opath_ipc, schema=schema_ipc)
+    else:
+        chunker_apply = DummyWriter()
+        chunker_ipc = DummyWriter()
+
     # parse it up
-    print(f'Parsing {fname}, gen {gen}')
-    i0 = i
     try:
-        parser(fpath)
+        print(f'{ftag}: Starting')
+
+        i = 0
+        for pat in parser(fpath):
+            i += 1
+
+            store_patent(pat, chunker_apply, chunker_ipc)
+
+            # output
+            if display > 0 and i % display == 0:
+                spat = {k: pat.get(k, '') for k in schema_apply}
+                print('an = {appnum:10.10s}, fd = {appdate:10.10s}, ti = {title:30.30s}, on = {appname:30.30s}, ci = {city:15.15s}, st = {state:2s}, ct = {country:2s}'.format(**spat))
+
+        # commit to db and close
+        chunker_apply.commit()
+        chunker_ipc.commit()
+
+        print(f'{ftag}: Parsed {i} patents')
     except Exception as e:
-        print('EXCEPTION OCCURRED!')
+        print(f'{ftag}: EXCEPTION OCCURRED!')
         print_exc()
-    print(f'Found {i-i0} patents, {i} total')
-    print()
-
-# commit to db and close
-pat_chunker.commit()
-ipc_chunker.commit()
+        chunker_apply.delete()
+        chunker_ipc.delete()
+
+if __name__ == '__main__':
+    import argparse
+    from multiprocessing import Pool
+
+    # parse input arguments
+    parser = argparse.ArgumentParser(description='patent application parser')
+    parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
+    parser.add_argument('--output', type=str, default='parsed/apply', help='directory to output to')
+    parser.add_argument('--display', type=int, default=1000, help='how often to display summary')
+    parser.add_argument('--dryrun', action='store_true', help='do not actually store')
+    parser.add_argument('--overwrite', action='store_true', help='clobber existing files')
+    parser.add_argument('--cores', type=int, default=10, help='number of cores to use')
+    args = parser.parse_args()
+
+    # collect files
+    if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])):
+        targ_dir = 'data/apply' if len(args.target) == 0 else args.target[0]
+        file_list = sorted(glob.glob(f'{targ_dir}/pab*.xml')) + sorted(glob.glob(f'{targ_dir}/ipab*.xml'))
+    else:
+        file_list = args.target
+
+    # ensure output dir
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    # apply options
+    opts = dict(overwrite=args.overwrite, dryrun=args.dryrun, display=args.display)
+    def parse_file_opts(fpath):
+        parse_file(fpath, args.output, **opts)
 
-if args.db is not None:
-    con.close()
+    # parse files
+    with Pool(args.cores) as pool:
+        pool.map(parse_file_opts, file_list, chunksize=1)
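
Usage sketch (not part of the commit itself): based on the argparse block above, the parallelized parser would presumably be run as

    python parse_apply.py data/apply --output parsed/apply --cores 8

or, for a quick check that writes nothing,

    python parse_apply.py data/apply --dryrun

where each worker in the Pool handles one input file and writes a per-file *_apply.csv / *_ipc.csv pair into the output directory.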