11from collections import defaultdict
2+ from datetime import datetime
23from django .contrib .postgres .aggregates import ArrayAgg
34from django .db .models import F , Q
45from django .utils import timezone
6+ import gzip
7+ import json
8+ import os
59from tqdm import tqdm
610
711from seqr .models import Sample , Individual , Family , Project , RnaSample , RnaSeqOutlier , RnaSeqTpm , RnaSeqSpliceOutlier
1014from seqr .utils .logging_utils import SeqrLogger
1115from seqr .utils .middleware import ErrorsWarningsException
1216from seqr .utils .xpos_utils import format_chrom
13- from seqr .views .utils .file_utils import parse_file
17+ from seqr .views .utils .file_utils import parse_file , get_temp_file_path , persist_temp_file
1418from seqr .views .utils .permissions_utils import get_internal_projects
1519from seqr .views .utils .json_utils import _to_snake_case , _to_camel_case
1620from reference_data .models import GeneInfo
@@ -321,12 +325,6 @@ def _get_splice_id(row):
321325}
322326
323327
324- # TODO
325- def load_rna_seq (data_type , * args , ** kwargs ):
326- config = RNA_DATA_TYPE_CONFIGS [data_type ]
327- return _load_rna_seq (config ['model_class' ], config ['data_type' ], * args , config ['columns' ], ** config ['additional_kwargs' ], ** kwargs )
328-
329-
330328def _validate_rna_header (header , column_map ):
331329 required_column_map = {
332330 column_map .get (col , col ): col for col in [SAMPLE_ID_COL , PROJECT_COL , GENE_ID_COL , TISSUE_COL ]
@@ -340,7 +338,7 @@ def _validate_rna_header(header, column_map):
340338
341339
342340def _load_rna_seq_file (
343- file_path , data_source , user , data_type , model_cls , potential_samples , save_data , individual_data_by_key ,
341+ file_path , data_source , user , data_type , model_cls , potential_samples , sample_files , file_dir , individual_data_by_key ,
344342 column_map , mapping_file = None , allow_missing_gene = False , ignore_extra_samples = False ,
345343):
346344 sample_id_to_individual_id_mapping = {}
@@ -364,7 +362,7 @@ def _load_rna_seq_file(
364362 _parse_rna_row (
365363 dict (zip (header , line )), column_map , required_column_map , missing_required_fields ,
366364 sample_id_to_individual_id_mapping , potential_samples , loaded_samples , gene_ids , sample_guid_keys_to_load ,
367- samples_to_create , unmatched_samples , individual_data_by_key , save_data , ignore_extra_samples ,
365+ samples_to_create , unmatched_samples , individual_data_by_key , sample_files , file_dir , ignore_extra_samples ,
368366 )
369367
370368 errors , warnings = _process_rna_errors (
@@ -384,7 +382,7 @@ def _load_rna_seq_file(
384382
385383def _parse_rna_row (row , column_map , required_column_map , missing_required_fields , sample_id_to_individual_id_mapping ,
386384 potential_samples , loaded_samples , gene_ids , sample_guid_keys_to_load , samples_to_create ,
387- unmatched_samples , individual_data_by_key , save_data , ignore_extra_samples ):
385+ unmatched_samples , individual_data_by_key , sample_files , file_dir , ignore_extra_samples ):
388386 row_dict = {mapped_key : row [col ] for mapped_key , col in column_map .items ()}
389387
390388 missing_cols = {col_id for col , col_id in required_column_map .items () if not row .get (col )}
@@ -424,7 +422,14 @@ def _parse_rna_row(row, column_map, required_column_map, missing_required_fields
424422
425423 for gene_id in row_gene_ids :
426424 row_dict = {** row_dict , GENE_ID_COL : gene_id }
427- save_data (sample_key , row_dict )
425+ if sample_key not in sample_files :
426+ file_name = _get_sample_file_path (file_dir , '_' .join (sample_key ))
427+ sample_files [sample_key ] = gzip .open (file_name , 'at' )
428+ sample_files [sample_key ].write (f'{ json .dumps (row_dict )} \n ' )
429+
430+
def _get_sample_file_path(file_dir, sample_guid):
    """Return the path of the per-sample ``.json.gz`` file inside ``file_dir``.

    Args:
        file_dir: Directory that holds the per-sample files.
        sample_guid: Base name for the file. Callers in this module pass
            either a sample GUID or a ``'_'``-joined sample key.

    Returns:
        ``file_dir`` joined with ``<sample_guid>.json.gz``.
    """
    # The same helper builds the path a file is written to and the
    # source/target of the later rename, so the name must be exact: no
    # whitespace between the identifier and the extension.
    return os.path.join(file_dir, f'{sample_guid}.json.gz')
428433
429434
430435def _process_rna_errors (gene_ids , missing_required_fields , unmatched_samples , ignore_extra_samples , loaded_samples ):
@@ -492,7 +497,10 @@ def _match_new_sample(sample_key, samples_to_create, unmatched_samples, individu
492497 unmatched_samples .add (sample_key )
493498
494499
495- def _load_rna_seq (model_cls , data_type , file_path , save_data , * args , user = None , ** kwargs ):
500+ def load_rna_seq (data_type , file_path , user , ** kwargs ):
501+ config = RNA_DATA_TYPE_CONFIGS [data_type ]
502+ data_type = config ['data_type' ]
503+ model_cls = config ['model_class' ]
496504 projects = get_internal_projects ()
497505 data_source = file_path .split ('/' )[- 1 ].split ('_-_' )[- 1 ]
498506
@@ -503,8 +511,14 @@ def _load_rna_seq(model_cls, data_type, file_path, save_data, *args, user=None,
503511 )
504512 individual_data_by_key = _get_individuals_by_key (projects )
505513
514+ sample_files = {}
515+ file_name_prefix = f'rna_sample_data__{ data_type } __{ datetime .now ().isoformat ()} '
516+ file_dir = get_temp_file_path (file_name_prefix , is_local = True )
517+ os .mkdir (file_dir )
518+
506519 warnings , not_loaded_count , sample_guid_keys_to_load , prev_loaded_individual_ids = _load_rna_seq_file (
507- file_path , data_source , user , data_type , model_cls , potential_samples , save_data , individual_data_by_key , * args , ** kwargs )
520+ file_path , data_source , user , data_type , model_cls , potential_samples , sample_files , file_dir , individual_data_by_key ,
521+ config ['columns' ], ** config ['additional_kwargs' ], ** kwargs )
508522 message = f'Parsed { len (sample_guid_keys_to_load ) + not_loaded_count } RNA-seq samples'
509523 info = [message ]
510524 logger .info (message , user )
@@ -524,10 +538,19 @@ def _load_rna_seq(model_cls, data_type, file_path, save_data, *args, user=None,
524538 for warning in warnings :
525539 logger .warning (warning , user )
526540
527- return sample_guid_keys_to_load , info , warnings
541+ for sample_guid , sample_key in sample_guid_keys_to_load .items ():
542+ sample_files [sample_key ].close () # Required to ensure gzipped files are properly terminated
543+ os .rename (
544+ _get_sample_file_path (file_dir , '_' .join (sample_key )),
545+ _get_sample_file_path (file_dir , sample_guid ),
546+ )
547+
548+ if sample_guid_keys_to_load :
549+ persist_temp_file (file_name_prefix , user )
550+
551+ return sample_guid_keys_to_load , file_name_prefix , info , warnings
528552
529553
530- # TODO
531554def post_process_rna_data (sample_guid , data , get_unique_key = None , format_fields = None ):
532555 mismatches = set ()
533556 invalid_format_fields = defaultdict (set )
0 commit comments