Skip to content

Commit

Permalink
all parsing complete
Browse files Browse the repository at this point in the history
  • Loading branch information
iamlemec committed May 7, 2019
1 parent c4eef2e commit 5fa8ab4
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 47 deletions.
28 changes: 28 additions & 0 deletions parse_compu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import argparse
import sqlite3
import pandas as pd

# command-line interface: a positional input path plus an optional sqlite target
parser = argparse.ArgumentParser(description='Compustat file parser.')
for name, opts in [
    ('target', dict(type=str, help='path of file to parse')),
    ('--db', dict(type=str, default=None, help='database file to store to')),
]:
    parser.add_argument(name, **opts)
args = parser.parse_args()

# read frame into memory: skip the original header row and apply our own
# column names; malformed rows are skipped with a warning rather than
# aborting the whole parse.
# NOTE: on_bad_lines='warn' replaces error_bad_lines=False (deprecated in
# pandas 1.3, removed in pandas 2.0) with the same skip-and-warn behavior.
datf = pd.read_csv(args.target, on_bad_lines='warn', skiprows=1, names=[
    'gvkey', 'datadate', 'year', 'name', 'assets', 'capx', 'cash', 'cogs',
    'shares', 'deprec', 'income', 'employ', 'intan', 'debt', 'prefstock',
    'revenue', 'sales', 'rnd', 'fcost', 'price', 'naics', 'sic'
])

# clean up: derive market value, drop raw inputs and incomplete rows,
# then normalize the name and NAICS columns
datf['mktval'] = datf['price']*datf['shares']
datf = (
    datf.drop(columns=['datadate', 'shares', 'prefstock', 'price'])
        .dropna(subset=['gvkey', 'year', 'name'])
)
datf['name'] = datf['name'].str.lower()
# NOTE(review): '<06d' left-justifies and right-pads with zeros (e.g. 31 -> '310000');
# this matches the NAICS convention of extending shorter codes to 6 digits — confirm intended
datf['naics'] = datf['naics'].fillna(0).astype(int).map(lambda code: f'{code:<06d}')

# write to sql
if args.db is None:
    # --db defaults to None, and sqlite3.connect(None) fails with an opaque
    # TypeError; fail early with a clear message instead
    raise SystemExit('no output database given (use --db); nothing written')
with sqlite3.connect(args.db) as con:
    datf.to_sql('compustat', con, if_exists='replace')
    con.commit()  # explicit; the context manager also commits on clean exit
con.close()  # sqlite3's context manager commits but does not close the connection
41 changes: 0 additions & 41 deletions parse_compustat.py

This file was deleted.

13 changes: 7 additions & 6 deletions parse_maint.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# import to dataframe
print('reading table')

colspec = [(0, 7), (8, 16), (17, 18), (19, 27), (28, 36), (37, 45), (46, 51)]
colspec = [(0, 13), (14, 22), (23, 24), (25, 33), (34, 42), (43, 51), (52, 56)]
datf = pd.read_fwf(args.target, colspecs=colspec, usecols=[0, 2, 6], names=['patnum', 'is_small', 'event_code'])

# normalize patent number
Expand All @@ -27,17 +27,18 @@
mmap = [(m, 4) for m in m4] + [(m, 8) for m in m8] + [(m, 12) for m in m12]
codes = pd.DataFrame(mmap, columns=['code', 'lag']).set_index('code')

datf = datf.join(codes, on='event_code', how='left').drop('event_code', axis=1)
datf = datf.join(codes, on='event_code', how='left').dropna()
datf = datf.drop('event_code', axis=1)
datf['is_small'] = datf['is_small'] == 'Y'
pat_groups = datf.groupby('patnum')
last_maint = pat_groups['lag'].max()
ever_large = ~pat_groups['is_small'].min()
dpat = pd.DataFrame({'last_maint': last_maint, 'ever_large': ever_large})
dpat = pd.DataFrame({
'last_maint': pat_groups['lag'].max().astype(int),
'ever_large': ~pat_groups['is_small'].min().astype(bool)
})

# commit to sql
print('writing table')

with sqlite3.connect(args.db) as con:
dpat.to_sql('maint', con, if_exists='replace')
con.execute('CREATE UNIQUE INDEX maint_idx ON maint(patnum)')
con.commit()

0 comments on commit 5fa8ab4

Please sign in to comment.