|  | 
| 10 | 10 | 
 | 
| 11 | 11 | 
 | 
| 12 | 12 | # ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ====== | 
| 13 |  | -import sys | 
| 14 |  | - | 
| 15 |  | -n_ks = sys.argv[1] | 
| 16 |  | - | 
| 17 |  | -# each job command should be formatted as a string | 
| 18 |  | -job_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'eventseg_cruncher.py') | 
| 19 | 13 | 
 | 
| 20 | 14 | job_commands = list() | 
| 21 | 15 | job_names = list() | 
|  | 16 | +job_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'eventseg_cruncher.py') | 
| 22 | 17 | 
 | 
| 23 |  | -for root, dirs, files in os.walk(config['datadir']): | 
| 24 |  | -    for file in [f for f in files if f.startswith('debug')]: | 
| 25 |  | -        filepath = os.path.join(root,file) | 
| 26 |  | -        rectype = os.path.split(root)[-1] | 
| 27 |  | -        turkid = os.path.splitext(file)[0] | 
| 28 |  | - | 
| 29 |  | -        subjdir = os.path.join(config['resultsdir'], rectype, turkid) | 
| 30 |  | -        if not os.path.isdir(subjdir): | 
| 31 |  | -            os.makedirs(subjdir, exist_ok=True) | 
|  | 18 | +segments_dir = os.path.join(config['datadir'], 'segments') | 
|  | 19 | +eventseg_models_dir = os.path.join(config['datadir'], 'eventseg_models') | 
| 32 | 20 | 
 | 
| 33 |  | -        for k in range(2,int(n_ks)+1): | 
| 34 |  | -            if not os.path.isfile(os.path.join(subjdir,'k'+str(k)+'.npy')): | 
| 35 |  | -                job_commands.append(' '.join([job_script, filepath, str(k)])) | 
| 36 |  | -                job_names.append('segment_' + turkid + '_' + rectype + '_k' + str(k) + '.sh') | 
|  | 21 | +try: | 
|  | 22 | +    os.mkdir(segments_dir) | 
|  | 23 | +except FileExistsError: | 
|  | 24 | +    pass | 
| 37 | 25 | 
 | 
|  | 26 | +try: | 
|  | 27 | +    os.mkdir(eventseg_models_dir) | 
|  | 28 | +except FileExistsError: | 
|  | 29 | +    pass | 
| 38 | 30 | 
 | 
|  | 31 | +traj_dir = os.path.join(config['datadir'], 'trajectories') | 
|  | 32 | +script_names = [f[:-len('_traj.npy')] for f in os.listdir(traj_dir) if f.endswith('_traj.npy')]  # slice off the exact suffix: str.rstrip() strips a character set, not a suffix (e.g. 'pantry_traj.npy' -> '') | 
| 39 | 33 | 
 | 
|  | 34 | +for s in script_names: | 
|  | 35 | +    scriptseg_dir = os.path.join(segments_dir, s) | 
|  | 36 | +    script_eventsegs_dir = os.path.join(eventseg_models_dir, s) | 
|  | 37 | +    try: | 
|  | 38 | +        os.mkdir(scriptseg_dir) | 
|  | 39 | +    except FileExistsError: | 
|  | 40 | +        pass | 
|  | 41 | +    try: | 
|  | 42 | +        os.mkdir(script_eventsegs_dir) | 
|  | 43 | +    except FileExistsError: | 
|  | 44 | +        pass | 
|  | 45 | + | 
|  | 46 | +    for k in range(2, 75): | 
|  | 47 | +        job_commands.append(f'{job_script} {s} {str(k)}') | 
|  | 48 | +        job_names.append(f'segment_{s}_k{str(k)}') | 
|  | 49 | + | 
|  | 50 | +# import pandas as pd | 
|  | 51 | +# from helpers import download_from_google_drive as dl | 
|  | 52 | +# | 
|  | 53 | +# # download pre-trained CountVectorizer and LatentDirichletAllocation models | 
|  | 54 | +# cv_id = '1qD27Os44vojkC0UUf2cYlDZ5XytotGbK' | 
|  | 55 | +# cv_dest = os.path.join(config['datadir'], 'fit_cv.joblib') | 
|  | 56 | +# lda_id = '1iu7X84Hd1y6Vhz8xtG2nZZ_OSolkjz9g' | 
|  | 57 | +# lda_dest = os.path.join(config['datadir'], 'fit_lda_t100.joblib') | 
|  | 58 | +# dl(cv_id, cv_dest) | 
|  | 59 | +# dl(lda_id, lda_dest) | 
|  | 60 | +# | 
|  | 61 | +# job_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'eventseg_cruncher.py') | 
|  | 62 | +# | 
|  | 63 | +# for output in ['trajectories', 'corrmats']: | 
|  | 64 | +#     if not os.path.isdir(os.path.join(config['datadir'], output)): | 
|  | 65 | +#         os.mkdir(os.path.join(config['datadir'], output)) | 
|  | 66 | +# | 
|  | 67 | +# # load in and clean data | 
|  | 68 | +# data_df = pd.read_csv(os.path.join(config['datadir'], 'data.csv')) | 
|  | 69 | +# data_df.dropna(subset=['script'], inplace=True) | 
|  | 70 | +# data_df.drop_duplicates(subset=['title'], inplace=True) | 
|  | 71 | +# | 
|  | 72 | +# job_commands = list() | 
|  | 73 | +# job_names = list() | 
|  | 74 | +# | 
|  | 75 | +# for _, row in data_df.iterrows(): | 
|  | 76 | +#     job_commands.append(f'{job_script} {row.id}') | 
|  | 77 | +#     job_names.append(f'transform_{row.title}.sh') | 
| 40 | 78 | 
 | 
| 41 |  | -## job_commands = map(lambda x: x[0]+" "+str(x[1]), zip([job_script]*10, range(10))) | 
| 42 | 79 | 
 | 
| 43 |  | -# job_names should specify the file name of each script (as a list, of the same length as job_commands) | 
| 44 |  | -## job_names = map(lambda x: str(x)+'.sh', range(len(job_commands))) | 
| 45 | 80 | # ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ====== | 
| 46 | 81 | 
 | 
| 47 | 82 | assert(len(job_commands) == len(job_names)) | 
|  | 
0 commit comments