From 5fa8ab4017d1d1af3c86a80d6ef82fe592c242f2 Mon Sep 17 00:00:00 2001
From: Douglas Hanley
Date: Mon, 6 May 2019 22:10:41 -0400
Subject: [PATCH] all parsing complete

---
 parse_compu.py     | 28 ++++++++++++++++++++++++++++
 parse_compustat.py | 41 -----------------------------------------
 parse_maint.py     | 13 +++++++------
 3 files changed, 35 insertions(+), 47 deletions(-)
 create mode 100644 parse_compu.py
 delete mode 100644 parse_compustat.py

diff --git a/parse_compu.py b/parse_compu.py
new file mode 100644
index 0000000..9d37f34
--- /dev/null
+++ b/parse_compu.py
@@ -0,0 +1,28 @@
+import argparse
+import sqlite3
+import pandas as pd
+
+# parse input arguments
+parser = argparse.ArgumentParser(description='Compustat file parser.')
+parser.add_argument('target', type=str, help='path of file to parse')
+parser.add_argument('--db', type=str, default=None, help='database file to store to')
+args = parser.parse_args()
+
+# read frame into memory
+datf = pd.read_csv(args.target, error_bad_lines=False, skiprows=1, names=[
+    'gvkey', 'datadate', 'year', 'name', 'assets', 'capx', 'cash', 'cogs',
+    'shares', 'deprec', 'income', 'employ', 'intan', 'debt', 'prefstock',
+    'revenue', 'sales', 'rnd', 'fcost', 'price', 'naics', 'sic'
+])
+
+# clean up data
+datf['mktval'] = datf['shares']*datf['price']
+datf = datf.drop(['datadate', 'shares', 'prefstock', 'price'], axis=1)
+datf = datf.dropna(subset=['gvkey', 'year', 'name'])
+datf['name'] = datf['name'].str.lower()
+datf['naics'] = datf['naics'].fillna(0).astype(int).map(lambda x: f'{x:<06d}')
+
+# write to sql
+with sqlite3.connect(args.db) as con:
+    datf.to_sql('compustat', con, if_exists='replace')
+    con.commit()
diff --git a/parse_compustat.py b/parse_compustat.py
deleted file mode 100644
index b7e0cda..0000000
--- a/parse_compustat.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import argparse
-import sqlite3
-import pandas as pd
-
-# parse input arguments
-parser = argparse.ArgumentParser(description='Compustat file parser.')
-parser.add_argument('target', type=str, help='path of file to parse')
-parser.add_argument('--db', type=str, default=None, help='database file to store to')
-args = parser.parse_args()
-
-# connect to compustat db
-con = sqlite3.connect(args.db)
-cur = con.cursor()
-
-# read frame into memory
-datf = pd.read_csv(args.target, error_bad_lines=False, skiprows=1, names=[
-    'gvkey', 'datadate', 'year', 'name', 'assets', 'capx', 'cash', 'cogs',
-    'shares', 'deprec', 'income', 'employ', 'intan', 'debt', 'prefstock',
-    'revenue', 'sales', 'rnd', 'fcost', 'price', 'naics', 'sic', 'acquire',
-    'acquire_income'
-])
-
-# clean up data
-datf['mktval'] = datf['shares']*datf['price']
-datf = datf.drop(['datadate', 'shares', 'prefstock', 'price'], axis=1)
-datf = datf.fillna({'naics': 0})
-datf['naics'] = datf['naics'].map(lambda x: int('{:<6.0f}'.format(x).replace(' ', '0')))
-datf = datf[~((datf['naics']>=520000)&(datf['naics']<530000))] # remove financial firms
-
-# write to sql
-datf.to_sql('compustat', con, if_exists='replace')
-
-# clean up and generate primary key on firmyear
-cur.execute("""delete from compustat where year=''""")
-cur.execute("""delete from compustat where rowid not in (select min(rowid) from compustat group by gvkey,year)""")
-cur.execute("""delete from compustat where name is null""")
-cur.execute("""create unique index firmyear_idx on compustat(gvkey asc, year asc)""")
-
-# close db
-con.commit()
-con.close()
diff --git a/parse_maint.py b/parse_maint.py
index 071d46f..8996767 100644
--- a/parse_maint.py
+++ b/parse_maint.py
@@ -12,7 +12,7 @@
 
 # import to dataframe
 print('reading table')
-colspec = [(0, 7), (8, 16), (17, 18), (19, 27), (28, 36), (37, 45), (46, 51)]
+colspec = [(0, 13), (14, 22), (23, 24), (25, 33), (34, 42), (43, 51), (52, 56)]
 datf = pd.read_fwf(args.target, colspecs=colspec, usecols=[0, 2, 6], names=['patnum', 'is_small', 'event_code'])
 
 # normalize patent number
@@ -27,17 +27,18 @@
 
 mmap = [(m, 4) for m in m4] + [(m, 8) for m in m8] + [(m, 12) for m in m12]
 codes = pd.DataFrame(mmap, columns=['code', 'lag']).set_index('code')
-datf = datf.join(codes, on='event_code', how='left').drop('event_code', axis=1)
+datf = datf.join(codes, on='event_code', how='left').dropna()
+datf = datf.drop('event_code', axis=1)
 datf['is_small'] = datf['is_small'] == 'Y'
 
 pat_groups = datf.groupby('patnum')
-last_maint = pat_groups['lag'].max()
-ever_large = ~pat_groups['is_small'].min()
-dpat = pd.DataFrame({'last_maint': last_maint, 'ever_large': ever_large})
+dpat = pd.DataFrame({
+    'last_maint': pat_groups['lag'].max().astype(int),
+    'ever_large': ~pat_groups['is_small'].min().astype(bool)
+})
 
 # commit to sql
 print('writing table')
 with sqlite3.connect(args.db) as con:
     dpat.to_sql('maint', con, if_exists='replace')
-    con.execute('CREATE UNIQUE INDEX maint_idx ON maint(patnum)')
     con.commit()
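
A minimal invocation sketch for the touched scripts (input file names below are placeholders, not part of the patch; parse_compu.py takes a positional input path plus an optional --db SQLite target per its argparse setup, and parse_maint.py appears to use the same interface given its args.target/args.db references):

    python parse_compu.py compustat_export.csv --db store.db
    python parse_maint.py maintfee_events.txt --db store.db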