22 changes: 0 additions & 22 deletions eventseg_collector.py
@@ -1,23 +1 @@
#!/usr/bin/python

from eventseg_config import config
import os
import numpy as np
import pandas as pd

segments_df = pd.DataFrame()

for root, dirs, files in os.walk(config['resultsdir']):
    event_models = [f for f in files if f.endswith('.npy')]
    if event_models:
        ep_path, turkid = os.path.split(root)
        ep_name = os.path.split(ep_path)[1]
        multiindex = pd.MultiIndex.from_product([[ep_name], [turkid]])
        tmp_df = pd.DataFrame(index=multiindex, columns=[os.path.splitext(em)[0] for em in event_models])
        for e in event_models:
            ev_mod = np.load(os.path.join(root, e))
            tmp_df.at[(ep_name, turkid), os.path.splitext(e)[0]] = ev_mod

        segments_df = segments_df.append(tmp_df)

segments_df.to_pickle(os.path.join(config['resultsdir'], 'segments_df.p'))
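For reference, a minimal sketch (not part of this diff) of reading the collected results back in; the (episode, turkid) MultiIndex and per-k columns follow from the collector code above, while the episode and participant names are placeholders:

import pandas as pd

# load the DataFrame written by eventseg_collector.py (path is a placeholder)
segments_df = pd.read_pickle('segments_df.p')
# rows are indexed by (episode, turkid); each 'kN' column holds one saved event model array
ev_mod = segments_df.loc[('some_episode', 'some_turkid'), 'k5']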
10 changes: 5 additions & 5 deletions eventseg_config.py
@@ -9,14 +9,14 @@
#add additional checks for your local machine here...
# ******** check kiewit hostname from eduroam ********
if (socket.gethostname() == 'Paxtons-MacBook-Pro') or (socket.gethostname() == 'Paxtons-MacBook-Pro.kiewit.dartmouth.edu') or (socket.gethostname() == 'Paxtons-MacBook-Pro.local'):
    config['datadir'] = '/Users/paxtonfitzpatrick/Documents/Dartmouth/Thesis/memory-dynamics/data/models/participants/trajectories'
    config['datadir'] = '/Users/paxtonfitzpatrick/Documents/Dartmouth/CDL/MIND-2019/narrative_complexity/data'
    config['workingdir'] = config['datadir']
    config['startdir'] = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))  # directory to start the job in
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_local.sh')
else:
    config['datadir'] = os.path.join('/dartfs/rc/lab/D/DBIC/CDL/f0028ph/eventseg', 'trajectories')
    config['workingdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/eventseg/cluster-scripts'
    config['startdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/eventseg/'
    config['datadir'] = '/dartfs/rc/lab/D/DBIC/CDL/data/movie_scripts/'
    config['startdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/MIND-narrative-modeling'
    config['workingdir'] = os.path.join(config['startdir'], 'cluster-scripts')
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_cluster.sh')

config['scriptdir'] = os.path.join(config['workingdir'], 'scripts')
@@ -27,7 +27,7 @@
# runtime options
config['jobname'] = "eventseg" # default job name
config['q'] = "largeq" # options: default, test, largeq
config['nnodes'] = 1 # how many nodes to use for this one job
config['nnodes'] = 10 # how many nodes to use for this one job
config['ppn'] = 4 # how many processors to use for this one job (assume 4GB of RAM per processor)
config['walltime'] = '20:00:00' # maximum runtime, in h:MM:SS
config['cmd_wrapper'] = "python" # replace with actual command wrapper (e.g. matlab, python, etc.)
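These options presumably get substituted into run_job_cluster.sh when jobs are generated. As a rough illustration (the template itself isn't shown in this diff, so the exact directives are an assumption), they would correspond to a Torque/PBS-style header like:

# hypothetical sketch -- run_job_cluster.sh is not part of this diff
pbs_header = (
    f"#PBS -N {config['jobname']}\n"
    f"#PBS -q {config['q']}\n"
    f"#PBS -l nodes={config['nnodes']}:ppn={config['ppn']}\n"
    f"#PBS -l walltime={config['walltime']}\n"
)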
70 changes: 59 additions & 11 deletions eventseg_cruncher.py
@@ -1,20 +1,68 @@
#!/usr/bin/python

import sys
import os
import sys
import pickle
import numpy as np
import brainiak.eventseg.event as event
from eventseg_config import config

filepath, k = sys.argv[1], sys.argv[2]
dir, f_name = os.path.split(filepath)
rectype = os.path.split(dir)[1]
trajectory = np.load(filepath)
savepath = os.path.join(config['resultsdir'], rectype, os.path.splitext(f_name)[0], 'k'+k+'.npy')
script_name, k = sys.argv[1], int(sys.argv[2])

traj_path = os.path.join(config['datadir'], 'trajectories', f'{script_name}_traj.npy')
traj = np.load(traj_path)

ev = event.EventSegment(k)
ev.fit(traj)

# convert the soft event assignments (timepoints x k probabilities) to a boolean
# mask, then average the trajectory timepoints falling within each event
w = (np.round(ev.segments_[0]) == 1).astype(bool)
segs = np.array([traj[wi, :].mean(0) for wi in w.T])

segments_filepath = os.path.join(config['datadir'], 'segments', script_name, f'{script_name}_events_k{k}.npy')
eventseg_filepath = os.path.join(config['datadir'], 'eventseg_models', script_name, f'{script_name}_eventseg_k{k}.p')

np.save(segments_filepath, segs)
with open(eventseg_filepath, 'wb') as f:
    pickle.dump(ev, f)
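A quick sketch (not part of the diff) of loading a fitted model and its event patterns back; the filenames below are placeholders mirroring the save calls above:

import pickle
import numpy as np

with open('some_script_eventseg_k5.p', 'rb') as f:
    ev = pickle.load(f)  # fitted brainiak EventSegment
segs = np.load('some_script_events_k5.npy')  # k rows: mean trajectory within each event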





if not os.path.isfile(savepath):
    ev = event.EventSegment(int(k))
    ev.fit(trajectory)

    np.save(savepath, ev.segments_[0])
# import sys
# import joblib
# import numpy as np
# import pandas as pd
# from scipy.signal import resample
# from eventseg_config import config
# from helpers import *
#
# id = sys.argv[1]
# wsize = 50
#
# # load only single row to save time & memory
# skiprows = range(1, id)
# data = pd.read_csv(os.path.join(config['datadir'], 'data.csv'), skiprows=skiprows, nrows=1).T.squeeze()
# name = data.title
#
# # remove HTML formatting, clean script content
# clean_script = cleanup_text(wipe_formatting(data.script))
#
# # don't model empty scripts (8,528 characters is length of shortest cleaned script)
# if len(clean_script) < 8528:
# sys.exit()
#
# cv = joblib.load(os.path.join(config['datadir'], 'fit_cv.joblib'))
# lda = joblib.load(os.path.join(config['datadir'], 'fit_lda_t100.joblib'))
#
# sentences = clean_script.split('.')
# windows = []
# for ix, _ in enumerate(sentences):
# windows.append(' '.join(sentences[ix:ix+wsize]))
#
#
# script_tf = cv.transform(windows)
# script_traj = resample(lda.transform(script_tf), 1000)
# corrmat = np.corrcoef(script_traj)
#
# np.save(os.path.join(config['datadir'], 'trajectories', f'{name}_traj.npy'), script_traj)
# np.save(os.path.join(config['datadir'], 'corrmats', f'{name}_corrmat.npy'), corrmat)
79 changes: 57 additions & 22 deletions eventseg_submit.py
@@ -10,38 +10,73 @@


# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======
import sys

n_ks = sys.argv[1]

# each job command should be formatted as a string
job_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'eventseg_cruncher.py')

job_commands = list()
job_names = list()
job_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'eventseg_cruncher.py')

for root, dirs, files in os.walk(config['datadir']):
    for file in [f for f in files if f.startswith('debug')]:
        filepath = os.path.join(root, file)
        rectype = os.path.split(root)[-1]
        turkid = os.path.splitext(file)[0]

        subjdir = os.path.join(config['resultsdir'], rectype, turkid)
        if not os.path.isdir(subjdir):
            os.makedirs(subjdir, exist_ok=True)
segments_dir = os.path.join(config['datadir'], 'segments')
eventseg_models_dir = os.path.join(config['datadir'], 'eventseg_models')

        for k in range(2, int(n_ks)+1):
            if not os.path.isfile(os.path.join(subjdir, 'k'+str(k)+'.npy')):
                job_commands.append(' '.join([job_script, filepath, str(k)]))
                job_names.append('segment_' + turkid + '_' + rectype + '_k' + str(k) + '.sh')
try:
    os.mkdir(segments_dir)
except FileExistsError:
    pass

try:
    os.mkdir(eventseg_models_dir)
except FileExistsError:
    pass

traj_dir = os.path.join(config['datadir'], 'trajectories')
script_names = [f[:-len('_traj.npy')] for f in os.listdir(traj_dir) if f.endswith('_traj.npy')]

for s in script_names:
    scriptseg_dir = os.path.join(segments_dir, s)
    script_eventsegs_dir = os.path.join(eventseg_models_dir, s)
    try:
        os.mkdir(scriptseg_dir)
    except FileExistsError:
        pass
    try:
        os.mkdir(script_eventsegs_dir)
    except FileExistsError:
        pass

    for k in range(2, 75):
        job_commands.append(f'{job_script} {s} {k}')
        job_names.append(f'segment_{s}_k{k}')
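For reference, one generated pair under this loop would look roughly like the following (path and title are hypothetical):

# e.g., for script 'the_matrix' and k=5:
#   job_commands[-1] == '/path/to/eventseg_cruncher.py the_matrix 5'
#   job_names[-1]    == 'segment_the_matrix_k5'
# the submitter presumably prepends config['cmd_wrapper'] ("python") when writing each job script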

# import pandas as pd
# from helpers import download_from_google_drive as dl
#
# # download pre-trained CountVectorizer and LatentDirichletAllocation models
# cv_id = '1qD27Os44vojkC0UUf2cYlDZ5XytotGbK'
# cv_dest = os.path.join(config['datadir'], 'fit_cv.joblib')
# lda_id = '1iu7X84Hd1y6Vhz8xtG2nZZ_OSolkjz9g'
# lda_dest = os.path.join(config['datadir'], 'fit_lda_t100.joblib')
# dl(cv_id, cv_dest)
# dl(lda_id, lda_dest)
#
# job_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'eventseg_cruncher.py')
#
# for output in ['trajectories', 'corrmats']:
# if not os.path.isdir(os.path.join(config['datadir'], output)):
# os.mkdir(os.path.join(config['datadir'], output))
#
# # load in and clean data
# data_df = pd.read_csv(os.path.join(config['datadir'], 'data.csv'))
# data_df.dropna(subset=['script'], inplace=True)
# data_df.drop_duplicates(subset=['title'], inplace=True)
#
# job_commands = list()
# job_names = list()
#
# for _, row in data_df.iterrows():
# job_commands.append(f'{job_script} {row.id}')
# job_names.append(f'transform_{row.title}.sh')

## job_commands = map(lambda x: x[0]+" "+str(x[1]), zip([job_script]*10, range(10)))

# job_names should specify the file name of each script (as a list, of the same length as job_commands)
## job_names = map(lambda x: str(x)+'.sh', range(len(job_commands)))
# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======

assert(len(job_commands) == len(job_names))
37 changes: 37 additions & 0 deletions model_scripts_config.py
@@ -0,0 +1,37 @@
import socket
import os

config = dict()

# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======
# job creation options

#add additional checks for your local machine here...
# ******** check kiewit hostname from eduroam ********
if (socket.gethostname() == 'Paxtons-MacBook-Pro') or (socket.gethostname() == 'Paxtons-MacBook-Pro.kiewit.dartmouth.edu') or (socket.gethostname() == 'Paxtons-MacBook-Pro.local'):
    config['datadir'] = '/Users/paxtonfitzpatrick/Documents/Dartmouth/CDL/MIND-2019/narrative_complexity/data'
    config['workingdir'] = config['datadir']
    config['startdir'] = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))  # directory to start the job in
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_local.sh')
else:
    config['datadir'] = '/dartfs/rc/lab/D/DBIC/CDL/data/movie_scripts/'
    config['startdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/MIND-narrative-modeling'
    config['workingdir'] = os.path.join(config['startdir'], 'cluster-scripts')
    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_cluster.sh')

config['scriptdir'] = os.path.join(config['workingdir'], 'scripts')
config['lockdir'] = os.path.join(config['workingdir'], 'locks')
config['resultsdir'] = os.path.join(config['workingdir'], 'results')


# runtime options
config['jobname'] = "model_script" # default job name
config['q'] = "largeq" # options: default, test, largeq
config['nnodes'] = 1 # how many nodes to use for this one job
config['ppn'] = 1 # how many processors to use for this one job (assume 4GB of RAM per processor)
config['walltime'] = '1:00:00' # maximum runtime, in h:MM:SS
config['cmd_wrapper'] = "python" # replace with actual command wrapper (e.g. matlab, python, etc.)

#extra options

# ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======
40 changes: 40 additions & 0 deletions model_scripts_cruncher.py
@@ -0,0 +1,40 @@
#!/usr/bin/python

import os
import sys
import joblib
import numpy as np
import pandas as pd
from scipy.signal import resample
from eventseg_config import config
from helpers import *

row_id = int(sys.argv[1])  # row number of this script in data.csv (passed by the submit script)
wsize = 50  # sliding window size, in sentences

# load only single row to save time & memory
skiprows = range(1, row_id)
data = pd.read_csv(os.path.join(config['datadir'], 'data.csv'), skiprows=skiprows, nrows=1).T.squeeze()
name = data.title

# remove HTML formatting, clean script content
clean_script = cleanup_text(wipe_formatting(data.script))

# don't model empty scripts (8,528 characters is length of shortest cleaned script)
if len(clean_script) < 8528:
[inline review comment from Member: "set to 10000 (cleaner)?"]
    sys.exit()

cv = joblib.load(os.path.join(config['datadir'], 'fit_cv.joblib'))
lda = joblib.load(os.path.join(config['datadir'], 'fit_lda_t100.joblib'))

sentences = clean_script.split('.')
windows = []
for ix, _ in enumerate(sentences):
    windows.append(' '.join(sentences[ix:ix+wsize]))


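# transform each window into topic space, FFT-resample the resulting
# trajectory to a fixed 1000 rows, and correlate timepoints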
script_tf = cv.transform(windows)
script_traj = resample(lda.transform(script_tf), 1000)
corrmat = np.corrcoef(script_traj)

np.save(os.path.join(config['datadir'], 'trajectories', f'{name}_traj.npy'), script_traj)
np.save(os.path.join(config['datadir'], 'corrmats', f'{name}_corrmat.npy'), corrmat)
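To make the sliding-window step concrete, a tiny standalone example (hypothetical sentences, with wsize shrunk to 2 for readability):

sentences = ['a b', 'c d', 'e f', 'g h']
wsize = 2
windows = [' '.join(sentences[ix:ix + wsize]) for ix in range(len(sentences))]
# windows == ['a b c d', 'c d e f', 'e f g h', 'g h']
# every sentence starts a window, so the final window(s) are shorter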