-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathload_nber_patent_data.py
129 lines (102 loc) · 4.39 KB
/
load_nber_patent_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
##############################################################################
#
# load_nber_patent_data.py - load NBER patent citation data
#
#
# File: load_nber_patent_data.py
# Author: Alex Stivala
# Created: December 2018
#
##############################################################################
"""Function to load the NBER patent citation data downloaded from
http://www.nber.org/patents/
References:
Hall, B., Jaffe, A., & Trajtenberg, M. (2001). The NBER patent
citations data file: Lessons, insights and methodological tools. NBER
working paper no. 8498.
Jaffe, A. B., & Trajtenberg, M. (2002). Patents, citations, and
innovations: A window on the knowledge economy. MIT press.
Input files (in specified directory):
acite75_99.zip
apat63_99.zip
For SNAP see
http://snap.stanford.edu/snappy/index.html
Used version 4.1.0.
E.g.
(G, patdata, colnames) = load_nber_patent_data('/home/stivala/patentCitations/')
NB this uses at least 5 GB memory and tmp directory space
"""
import os,sys
import glob
import tempfile
import zipfile
import csv
import snap
#-----------------------------------------------------------------------------
#
# Functions
#
#-----------------------------------------------------------------------------
def cleanup_tmpdir(tmpdir):
    """
    Remove a temporary directory and the files directly inside it

    Parameters:
       tmpdir - temporary directory to remove

    Return value: None

    This is best-effort cleanup: if anything cannot be removed (e.g. the
    directory does not exist, or it contains a subdirectory) a warning
    is written to stderr and no exception is raised.
    """
    try:
        # os.listdir (unlike a glob for "*") also returns dot-files, so
        # a hidden file cannot leave the directory non-empty for rmdir
        for name in os.listdir(tmpdir):
            os.remove(os.path.join(tmpdir, name))
        os.rmdir(tmpdir)
    except OSError as inst:
        # "except ... as" is valid on both Python 2.6+ and Python 3
        sys.stderr.write('WARNING: could not remove temp files'
                         ' in ' + tmpdir + '\n' + str(inst) + '\n')
def load_nber_patent_data(indirname):
    """Load the NBER patent citation data from specified directory

    Parameters:
       indirname - path name of directory to load from; the files
                   acite75_99.zip and apat63_99.zip are read from it

    Return value:
       tuple(G, patentdict, patent_colnames) where
        G - SNAP TNGraph object built from the data
            (loaded with snap.LoadEdgeList as a snap.PNGraph)
        patentdict - dictionary mapping patent ID (int) to list
                  of attributes (all strings, except the final
                  HASDATA entry which is the int 1)
        patent_colnames - dict mapping attribute name to
                  index of the patent list so e.g. we can look
                  up APPYEAR of patent ID 123 with
                  patentdict[123][patent_colnames['APPYEAR']]

    Note that in SNAP, node IDs are unique integers and do not have to
    be 0..N-1. So the patent ids can be used for these identifiers.
    However EstimNetDirected requires the node ids in the Pajek files
    for its input are numbered 1..N, so we will have to do renumbering
    for the output file (EstimNetDirected input file).
    """
    citefilename = "cite75_99.txt"
    tmpdir = tempfile.mkdtemp()
    try:
        filename = os.path.join(tmpdir, citefilename)
        # context managers make sure the archive and the temporary file
        # are closed even if reading or writing fails
        with zipfile.ZipFile(os.path.join(indirname, "acite75_99.zip")) as zf:
            citelines = zf.read(citefilename).splitlines()
        # ZipFile.read gives a byte string, so join and write as bytes;
        # this is the same on Python 2 and works on Python 3 as well
        with open(filename, 'wb') as fout:
            # skip header line "CITING","CITED"
            fout.write(b'\n'.join(citelines[1:]))
        del citelines  # large; release before SNAP builds the graph
        G = snap.LoadEdgeList(snap.PNGraph, filename, 0, 1, ',')
    finally:
        cleanup_tmpdir(tmpdir)

    # http://www.nber.org/patents/pat63_99.txt
    patentpath = os.path.join(indirname, "apat63_99.zip")
    with zipfile.ZipFile(patentpath) as zf:
        rows = zf.open("apat63_99.txt")
        if sys.version_info[0] >= 3:
            # Python 3 csv needs text, not bytes. latin-1 maps every byte
            # to one character so decoding cannot fail.
            # NOTE(review): the data is presumably plain ASCII - confirm.
            import io
            rows = io.TextIOWrapper(rows, encoding='latin-1', newline='')
        try:
            csviter = csv.reader(rows)

            # get header line ['PATENT', 'GYEAR', 'GDATE', 'APPYEAR',
            # 'COUNTRY', 'POSTATE', 'ASSIGNEE', 'ASSCODE', 'CLAIMS',
            # 'NCLASS', 'CAT', 'SUBCAT', 'CMADE', 'CRECEIVE', 'RATIOCIT',
            # 'GENERAL', 'ORIGINAL', 'FWDAPLAG', 'BCKGTLAG', 'SELFCTUB',
            # 'SELFCTLB', 'SECDUPBD', 'SECDLWBD']
            # but PATENT column 0 used as dict key so skip it
            colnames = next(csviter)[1:]  # skip PATENT column 0

            # add column for binary attribute 1 when there is data about
            # the patent (because it is in the 1963 - 1999 period in
            # pat63_99.txt) else 0.
            # For patents without data this field will be set to 0
            # in convertNBERpatentDataToEstimNetDirectedFormat.py
            # main when matching up the patent attributes to citations.
            colnames.append('HASDATA')
            patent_colnames = {name: col
                               for (col, name) in enumerate(colnames)}

            # have already read header line so rest of iterable csv read
            # is the data; build the dict directly rather than via an
            # intermediate list of every row. Append 1 for HASDATA.
            patentdict = {int(row[0]): row[1:] + [1] for row in csviter}
        finally:
            rows.close()

    return (G, patentdict, patent_colnames)