Skip to content

Commit e5963a6

Browse files
Merge pull request #6 from paxtonfitzpatrick/mind2019
Mind2019
2 parents 6064897 + 3c85e14 commit e5963a6

File tree

8 files changed

+369
-62
lines changed

8 files changed

+369
-62
lines changed

eventseg_collector.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
#!/usr/bin/python
"""Collect fitted event-segmentation models into a single pickled DataFrame.

Walks config['resultsdir'] (laid out as <resultsdir>/<episode>/<turkid>/*.npy),
gathers each participant's saved event models into one row of a
MultiIndex-ed (episode, turkid) DataFrame, and pickles the result.
"""

import os

import numpy as np
import pandas as pd

from eventseg_config import config

segments_df = pd.DataFrame()

# bug fix: the import binds `config`, not `eventseg_config` — the original
# `eventseg_config['resultsdir']` raised NameError
for root, dirs, files in os.walk(config['resultsdir']):
    event_models = [f for f in files if f.endswith('.npy')]
    if event_models:
        # root is <resultsdir>/<episode>/<turkid>
        ep_path, turkid = os.path.split(root)
        ep_name = os.path.split(ep_path)[1]
        multiindex = pd.MultiIndex.from_product([[ep_name], [turkid]])
        # one column per saved model file (extension stripped)
        tmp_df = pd.DataFrame(index=multiindex,
                              columns=[os.path.splitext(em)[0] for em in event_models])
        for e in event_models:
            ev_mod = np.load(os.path.join(root, e))
            tmp_df.at[(ep_name, turkid), os.path.splitext(e)[0]] = ev_mod
        segments_df = segments_df.append(tmp_df)

# bug fix: original line was missing the closing paren (SyntaxError)
segments_df.to_pickle(os.path.join(config['resultsdir'], 'segments_df.p'))

eventseg_config.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
#add additional checks for your local machine here...
# ******** check kiewit hostname from eduroam ********
if (socket.gethostname() == 'Paxtons-MacBook-Pro') or (socket.gethostname() == 'Paxtons-MacBook-Pro.kiewit.dartmouth.edu') or (socket.gethostname() == 'Paxtons-MacBook-Pro.local'):
    # local (laptop) paths
    config['datadir'] = '/Users/paxtonfitzpatrick/Documents/Dartmouth/CDL/MIND-2019/narrative_complexity/data'
    config['workingdir'] = config['datadir']
    config['startdir'] = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))  # directory to start the job in
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_local.sh')
else:
    # cluster (Discovery) paths
    config['datadir'] = '/dartfs/rc/lab/D/DBIC/CDL/data/movie_scripts/'
    config['startdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/MIND-narrative-modeling'
    # bug fix: bare `startdir` is undefined (NameError at import time on the
    # cluster) — the value lives in the config dict
    config['workingdir'] = os.path.join(config['startdir'], 'cluster-scripts')
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_cluster.sh')

config['scriptdir'] = os.path.join(config['workingdir'], 'scripts')
# runtime options
config.update(
    jobname="eventseg",       # default job name
    q="largeq",               # options: default, test, largeq
    nnodes=10,                # how many nodes to use for this one job
    ppn=4,                    # how many processors to use for this one job (assume 4GB of RAM per processor)
    walltime='20:00:00',      # maximum runtime, in h:MM:SS
    cmd_wrapper="python",     # replace with actual command wrapper (e.g. matlab, python, etc.)
)

eventseg_cruncher.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
#!/usr/bin/python
"""Fit a BrainIAK event-segmentation model to one script's topic trajectory.

Usage: eventseg_cruncher.py <script_name> <k>

Loads <datadir>/trajectories/<script_name>_traj.npy, fits an EventSegment
with k events, then saves the per-event mean trajectory vectors (.npy) and
the fitted model itself (pickle).
"""

import os
import sys
import pickle

import numpy as np
import brainiak.eventseg.event as event

# bug fix: this import was removed in the commit but config['datadir'] is
# still used below — without it the script dies with NameError
from eventseg_config import config

script_name, k = sys.argv[1], int(sys.argv[2])

traj_path = os.path.join(config['datadir'], 'trajectories', f'{script_name}_traj.npy')
traj = np.load(traj_path)

ev = event.EventSegment(k)
ev.fit(traj)

# boolean event membership: timepoint t is assigned to event i when its
# segment probability rounds to 1
w = (np.round(ev.segments_[0]) == 1).astype(bool)
# mean trajectory vector within each event (one row per event)
segs = np.array([traj[wi, :].mean(0) for wi in w.T])

# bug fix: both path expressions were missing their closing paren (SyntaxError)
segments_filepath = os.path.join(config['datadir'], 'segments', script_name,
                                 f'{script_name}_events_k{k}.npy')
eventseg_filepath = os.path.join(config['datadir'], 'eventseg_models', script_name,
                                 f'{script_name}_eventseg_k{k}.p')

np.save(segments_filepath, segs)
with open(eventseg_filepath, 'wb') as f:
    # bug fix: pickle.dump requires the file object as its second argument
    pickle.dump(ev, f)

# NOTE: removed a large commented-out copy of the script-transformation code;
# it lives (uncommented) in model_scripts_cruncher.py

eventseg_submit.py

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======

job_commands = list()
job_names = list()
job_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'eventseg_cruncher.py')

segments_dir = os.path.join(config['datadir'], 'segments')
eventseg_models_dir = os.path.join(config['datadir'], 'eventseg_models')

# idiom fix: exist_ok replaces the try/mkdir/except-FileExistsError dance
os.makedirs(segments_dir, exist_ok=True)
os.makedirs(eventseg_models_dir, exist_ok=True)

traj_dir = os.path.join(config['datadir'], 'trajectories')
# bug fix: str.rstrip strips a *character set*, not a suffix — the original
# f.rstrip('_traj.npy') also ate any trailing 't'/'r'/'a'/'j'/'n'/'p'/'y'/'.'/'_'
# characters from the script name itself.  Slice the suffix off instead.
_suffix = '_traj.npy'
script_names = [f[:-len(_suffix)] for f in os.listdir(traj_dir) if f.endswith(_suffix)]

for s in script_names:
    # one output subdirectory per script, for both segments and fitted models
    os.makedirs(os.path.join(segments_dir, s), exist_ok=True)
    os.makedirs(os.path.join(eventseg_models_dir, s), exist_ok=True)

    # one job per candidate number of events k
    for k in range(2, 75):
        job_commands.append(f'{job_script} {s} {k}')
        # NOTE(review): job_names should be script *file* names — the template
        # comment and the pre-commit code both used a '.sh' extension; restored
        # here.  Confirm against the submit framework's expectations.
        job_names.append(f'segment_{s}_k{k}.sh')

# NOTE: removed a large commented-out copy of the model_scripts submission
# code; it belongs with model_scripts_cruncher.py / model_scripts_config.py

# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======

# every command needs exactly one script name
assert(len(job_commands) == len(job_names))

model_scripts_config.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
"""Cluster-job configuration for the model_scripts (topic-trajectory) jobs."""

import socket
import os

config = dict()

# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======
# job creation options

#add additional checks for your local machine here...
# ******** check kiewit hostname from eduroam ********
if (socket.gethostname() == 'Paxtons-MacBook-Pro') or (socket.gethostname() == 'Paxtons-MacBook-Pro.kiewit.dartmouth.edu') or (socket.gethostname() == 'Paxtons-MacBook-Pro.local'):
    # local (laptop) paths
    config['datadir'] = '/Users/paxtonfitzpatrick/Documents/Dartmouth/CDL/MIND-2019/narrative_complexity/data'
    config['workingdir'] = config['datadir']
    config['startdir'] = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))  # directory to start the job in
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_local.sh')
else:
    # cluster (Discovery) paths
    config['datadir'] = '/dartfs/rc/lab/D/DBIC/CDL/data/movie_scripts/'
    config['startdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/MIND-narrative-modeling'
    # bug fix: bare `startdir` is undefined (NameError at import time on the
    # cluster) — the value lives in the config dict
    config['workingdir'] = os.path.join(config['startdir'], 'cluster-scripts')
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_cluster.sh')

config['scriptdir'] = os.path.join(config['workingdir'], 'scripts')
config['lockdir'] = os.path.join(config['workingdir'], 'locks')
config['resultsdir'] = os.path.join(config['workingdir'], 'results')


# runtime options
config['jobname'] = "model_script"  # default job name
config['q'] = "largeq"  # options: default, test, largeq
config['nnodes'] = 1  # how many nodes to use for this one job
config['ppn'] = 1  # how many processors to use for this one job (assume 4GB of RAM per processor)
config['walltime'] = '1:00:00'  # maximum runtime, in h:MM:SS
config['cmd_wrapper'] = "python"  # replace with actual command wrapper (e.g. matlab, python, etc.)

#extra options

# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======

model_scripts_cruncher.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
#!/usr/bin/python
"""Transform one movie script into a topic trajectory and correlation matrix.

Usage: model_scripts_cruncher.py <row_id>

Loads row <row_id> of <datadir>/data.csv, cleans the script text, projects
overlapping sentence windows through pre-fit CountVectorizer + LDA models,
resamples the resulting topic trajectory to 1000 timepoints, and saves the
trajectory and its timepoint correlation matrix.
"""

# bug fix: `os` was never imported but os.path.join is used throughout
import os
import sys

import joblib
import numpy as np
import pandas as pd
from scipy.signal import resample

# NOTE(review): originally imported from eventseg_config; this commit adds a
# dedicated model_scripts_config — confirm which config the submit script uses
from model_scripts_config import config
from helpers import *

# bug fix: sys.argv values are strings — range(1, id) raised TypeError;
# also renamed to avoid shadowing the `id` builtin
row_id = int(sys.argv[1])
wsize = 50  # sliding-window width, in sentences

# load only single row to save time & memory
# (skip data rows 1..row_id-1, keep the header plus row `row_id`)
skiprows = range(1, row_id)
data = pd.read_csv(os.path.join(config['datadir'], 'data.csv'), skiprows=skiprows, nrows=1).T.squeeze()
name = data.title

# remove HTML formatting, clean script content
clean_script = cleanup_text(wipe_formatting(data.script))

# don't model empty scripts (8,528 characters is length of shortest cleaned script)
if len(clean_script) < 8528:
    sys.exit()

cv = joblib.load(os.path.join(config['datadir'], 'fit_cv.joblib'))
lda = joblib.load(os.path.join(config['datadir'], 'fit_lda_t100.joblib'))

# bug fix: `cleaned` was undefined — the cleaned text is `clean_script`
sentences = clean_script.split('.')
# overlapping windows of `wsize` sentences smooth the topic trajectory
windows = [' '.join(sentences[ix:ix + wsize]) for ix in range(len(sentences))]

script_tf = cv.transform(windows)
script_traj = resample(lda.transform(script_tf), 1000)
corrmat = np.corrcoef(script_traj)

np.save(os.path.join(config['datadir'], 'trajectories', f'{name}_traj.npy'), script_traj)
np.save(os.path.join(config['datadir'], 'corrmats', f'{name}_corrmat.npy'), corrmat)

0 commit comments

Comments
 (0)