Skip to content

Multiple fixes for data inconsistencies across all datasets. #410

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion build/beatAML/GetBeatAML.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,7 @@ def generate_drug_list(drug_map_path,drug_path):
print(improve_map_file)
t_df = map_and_combine(t_df, "transcriptomics", args.genes, improve_map_file, sample_mapping_file)
t_df = t_df[t_df.entrez_id.notna()]
t_df = t_df[t_df.entrez_id != 0]
t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]].drop_duplicates()
t_df.to_csv("/tmp/beataml_transcriptomics.csv.gz",index=False,compression='gzip')

Expand All @@ -676,14 +677,15 @@ def generate_drug_list(drug_map_path,drug_path):
p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
p_df = map_and_combine(p_df, "proteomics", args.genes, improve_map_file, proteomics_map)
p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
p_df = p_df[p_df.entrez_id != 0]
p_df.to_csv("/tmp/beataml_proteomics.csv.gz",index=False,compression='gzip')

# New Mutation Data
print("Starting Mutation Data")
m_df = pd.read_csv(mutations_file, sep = '\t')

m_df = map_and_combine(m_df, "mutations", args.genes,improve_map_file, mutation_map_file)
m_df = m_df[["improve_sample_id","mutation", "entrez_id","variant_classification","source","study"]]
m_df = m_df[m_df.entrez_id != 0]
m_df.to_csv("/tmp/beataml_mutations.csv.gz",index=False,compression='gzip')

if args.exp:
Expand Down
2 changes: 1 addition & 1 deletion build/bladderpdo/00_createBladderPDOSampleFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def _parse_model_type(sample_id):
if "_xenoorganoid" in low:
return "xenograft derived organoid"
if "_organoid" in low:
return "organoid"
return "patient derived organoid"
if "_xenograft" in low:
return "patient derived xenograft"
if "_parental" in low:
Expand Down
14 changes: 11 additions & 3 deletions build/bladderpdo/01_createBladderPDOOmicsFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes):
final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]]
final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
final_mutations = final_mutations.dropna(subset=["entrez_id"])
final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)
final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int)
final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)
#drop entrez_ids equal to zero or N/A.
final_mutations = final_mutations.dropna(subset=["entrez_id"])
final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int)
final_mutations = final_mutations[final_mutations["entrez_id"] != 0]
return final_mutations

def get_bladder_pdo_copynumber(synObject, samples, genes):
Expand All @@ -124,7 +127,12 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"
final_copynumber = final_copynumber.dropna(subset=["entrez_id"])
final_copynumber["improve_sample_id"] = final_copynumber["improve_sample_id"].astype(int)
final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int)
# Drop rows whose entrez_id is not present in the genes table (genes.csv).
valid_entrez = set(genes['entrez_id'].astype(int))
final_copynumber = final_copynumber[
final_copynumber['entrez_id'].isin(valid_entrez)
]
final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int)
return final_copynumber


Expand Down
4 changes: 3 additions & 1 deletion build/broad_sanger/02-broadSangerOmics.R
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ depmap_files<-function(fi,value){

res<-exp_file|>
mutate(entrez_id=as.numeric(EntrezGeneID))|>
filter(entrez_id %in% genes$entrez_id) |>
left_join(as.data.frame(depmap_vtab))

##now many variants are missing???
Expand Down Expand Up @@ -442,7 +443,8 @@ depmap_files<-function(fi,value){
print("wide to long")
res = tidyr::pivot_longer(data=exp_file,cols=c(2:ncol(exp_file)),
names_to='gene_entrez',values_to='transcriptomics',
values_transform=list(expression=as.numeric))
values_transform=list(transcriptomics=as.numeric))|>
dplyr::mutate(transcriptomics = 2^transcriptomics - 1)
colnames(res)[1]<-'other_id'

print('fixing gene names')
Expand Down
3 changes: 3 additions & 0 deletions build/broad_sanger/05b_separate_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def main():
# Extract information to separate out datasets
exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list()
exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list()

#Ensure that the improve_sample_id column is in integer form.
exp = exp.with_column(pl.col("improve_sample_id").cast(pl.Float64).cast(pl.Int64))

# Write Filtered Experiments File to TSV. Then delete it from memory.
exp_filename_out = f"/tmp/{dataset}_experiments.tsv".lower()
Expand Down
9 changes: 7 additions & 2 deletions build/cptac/getCptacData.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def buildTumorSampleTable(sample_names, cancer_type, samples, maxval):
samples = samples.reset_index(drop=True)
return samples, maxval

def formatMutData(df, dtype, ctype, samp_names, source, samples):
def formatMutData(df, dtype, ctype, samp_names, source, genes, samples):
'''
Formats mutational data.
'''
Expand Down Expand Up @@ -159,6 +159,10 @@ def formatMutData(df, dtype, ctype, samp_names, source, samples):
'Mutation': 'mutation'
})
blongdf = blongdf[['improve_sample_id', 'entrez_id', 'mutation', 'variant_classification', 'source', 'study']]

#Ensure that genes that don't map to genes_file are dropped.
valid = set(genes['entrez_id'].astype(int))
blongdf = blongdf[blongdf.entrez_id.isin(valid)]
return blongdf


Expand Down Expand Up @@ -366,7 +370,7 @@ def main():
df.dropna(how='all', axis=0, inplace=True)
print(cancertype + ' ' + dtype)
if dtype == 'somatic_mutation':
fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], samples)
fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], genes, samples)
fdf = fdf.reset_index(drop=True)
dtype_key = 'mutations'
elif dtype == 'CNV':
Expand All @@ -393,6 +397,7 @@ def main():
print(df.to_string())
df['entrez_id'] = df['entrez_id'].fillna(0)
df['entrez_id'] = df['entrez_id'].astype(int)
df = df[df.entrez_id != 0]
df.to_csv("/tmp/" + "cptac_" + dtype_key + '.csv.gz', sep=',', index=False, compression='gzip')

if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions build/crcpdo/01-samples-crcpdo.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,13 @@ def generate_sample_file(sequencing_data_path:str = None, prev_samples_path:str
for index, row in samples_df.iterrows():
if "Tumor-Organoid" in samples_df.loc[index, 'other_id']:
samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-O"
samples_df.loc[index, 'model_type'] = "organoid"
samples_df.loc[index, 'model_type'] = "patient derived organoid"
if "Tumor-Biopsy" in samples_df.loc[index, 'other_id']:
samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-B"
samples_df.loc[index, 'model_type'] = "ex vivo"
samples_df.loc[index, 'model_type'] = "tumor"
if "Normal-Organoid" in samples_df.loc[index, 'other_id']:
samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "N-O"
samples_df.loc[index, 'model_type'] = "organoid"
samples_df.loc[index, 'model_type'] = "patient derived organoid"
samples_df['other_id_source'] = "vandeWetering_2015"
samples_df['cancer_type'] = "Colorectal Carcinoma"
samples_df['species'] = "Homo sapiens (Human)"
Expand Down
9 changes: 6 additions & 3 deletions build/hcmi/01-createHCMISamplesFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@ def align_to_linkml_schema(input_df):
-------
pd.DataFrame
A copy of the input DataFrame with the 'model_type' column values mapped to
a set of predefined categories ('tumor', 'organoid', 'cell line').
a set of predefined categories ('tumor', 'patient derived organoid', 'cell line').
The mapping is designed to align the DataFrame with the LinkML schema requirements.
"""

mapping_dict = {
'Solid Tissue': 'tumor',
'3D Organoid': 'organoid',
'3D Organoid': 'patient derived organoid',
'Peripheral Blood Components NOS': 'tumor',
'Buffy Coat': np.nan,
None: np.nan,
'Peripheral Whole Blood': 'tumor',
'Adherent Cell Line': 'cell line',
'3D Neurosphere': 'organoid',
'3D Neurosphere': 'patient derived organoid',
'2D Modified Conditionally Reprogrammed Cells': 'cell line',
'Pleural Effusion': np.nan,
'Human Original Cells': 'cell line',
Expand All @@ -50,6 +50,9 @@ def align_to_linkml_schema(input_df):
input_df.dropna(subset=['model_type'], inplace=True)
input_df = input_df.sort_values(by='improve_sample_id')

# A missing or empty cancer_type is treated as normal tissue and labeled 'Normal Tissue'.
input_df['cancer_type'] = input_df['cancer_type'].replace('', np.nan)
input_df['cancer_type'] = input_df['cancer_type'].fillna('Normal Tissue')
return input_df

def download_from_github(raw_url, save_path):
Expand Down
14 changes: 11 additions & 3 deletions build/hcmi/02-getHCMIData.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file):

# Load mapping files using Polars
genes = pl.read_csv(entrez_map_file) # Map gene_name to entrez_id

valid_entrez = genes["entrez_id"].cast(pl.Int64).unique().to_list()
# Process each dataframe based on its data_type
while dataframe_list:
df = dataframe_list.pop()
Expand All @@ -428,8 +428,16 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file):
mapped_df = mapped_df.select(['entrez_id', 'mutation', 'Variant_Classification', 'file_id'])
mapped_df = mapped_df.with_columns([pl.lit('GDC').alias('source'),
pl.lit('HCMI').alias('study')])
mapped_df = mapped_df.with_columns(mapped_df["entrez_id"].cast(str))

mapped_df = mapped_df.with_columns([
pl.col("entrez_id").cast(pl.Int64),
pl.lit('GDC' ).alias('source'),
pl.lit('HCMI').alias('study'),
])
#drop genes not in genes file.
mapped_df = mapped_df.filter(
(pl.col("entrez_id") != 0) &
pl.col("entrez_id").is_in(valid_entrez)
)
final_dataframe = pl.concat([final_dataframe, mapped_df])
del df, mapped_df
gc.collect()
Expand Down
2 changes: 1 addition & 1 deletion build/mpnst/00_sample_gen.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ sampTable<-manifest|>

##third, generate a sample for the MTs if they were generated
pdxmt<-subset(sampTable,!is.na(MicroTissueDrugFolder))
pdxmt$model_type=rep('organoid',nrow(pdxmt))
pdxmt$model_type=rep('xenograft derived organoid',nrow(pdxmt))
print(pdxmt)

main<-rbind(sampTable,pdxmt)|>
Expand Down
2 changes: 1 addition & 1 deletion build/mpnst/01_mpnst_get_omics.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ samples_df <- fread(patients)|>

pdx_samps<-subset(samples_df,model_type=='patient derived xenograft')
tumor_samps<-subset(samples_df,model_type=='tumor')
mt_samps<-subset(samples_df,model_type=='organoid')
mt_samps<-subset(samples_df,model_type=='xenograft derived organoid')

##now get the manifest from synapse
manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
Expand Down
8 changes: 6 additions & 2 deletions build/pancpdo/01-createPancPDOSamplesFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ def align_to_linkml_schema(input_df):

mapping_dict = {
'Solid Tissue': 'tumor',
'3D Organoid': 'organoid',
'3D Organoid': 'patient derived organoid',
'Peripheral Blood Components NOS': 'tumor',
'Buffy Coat': np.nan,
None: np.nan,
'Peripheral Whole Blood': 'tumor',
'Adherent Cell Line': 'cell line',
'3D Neurosphere': 'organoid',
'3D Neurosphere': 'patient derived organoid',
'2D Modified Conditionally Reprogrammed Cells': 'cell line',
'Pleural Effusion': np.nan,
'Human Original Cells': 'cell line',
Expand Down Expand Up @@ -301,6 +301,10 @@ def filter_and_subset_data(df, maxval, mapfile):
if not missing_ids.empty:
print("\nWarning: Some samples could not be assigned an 'improve_sample_id'.")
print(missing_ids)

# Missing cancer type indicates that it is normal tissue.
longtab['cancer_type'] = longtab['cancer_type'].replace('', np.nan)
longtab['cancer_type'] = longtab['cancer_type'].fillna('Normal Tissue')
return longtab

def main():
Expand Down
6 changes: 4 additions & 2 deletions build/sarcpdo/00_createSarcPDOSampleFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,12 @@ def download_and_format_rna_samples(synLoginObject):
rna_samples['model_type'] = modeltypeDF[0]
# add rows by hand for SARC0139_1 that are missing from sample sheet but present in rnaseq data
addrow1 = {'other_id' : 'SARC0139_1_Tumor', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'tumor'}
addrow2 = {'other_id' : 'SARC0139_1_Organoid', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'organoid'}
addrow2 = {'other_id' : 'SARC0139_1_Organoid', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'patient derived organoid'}
rna_samples.loc[len(rna_samples)] = addrow1
rna_samples.loc[len(rna_samples)] = addrow2


rna_samples.loc[rna_samples['model_type'] == 'organoid', 'model_type'] = 'patient derived organoid'

return rna_samples


Expand Down
5 changes: 5 additions & 0 deletions build/sarcpdo/01_createSarcPDOOmicsFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def download_and_format_transcriptomic(synLoginObject, genesTable, samplesTable)
final = melted_joined_renamed[['entrez_id', 'improve_sample_id', 'transcriptomics', 'source', 'study']]
# Drop duplicate rows here, after the final column selection (rather than a few lines above).
final = final.drop_duplicates()
# make sure entrez id is in int format.
final['entrez_id'] = final['entrez_id'].astype(int)
return final

def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTable):
Expand Down Expand Up @@ -79,6 +81,9 @@ def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTabl
mutationData =mutationData.rename({"Name": "mutation"}, axis=1)
# drop duplicates
mutationData = mutationData.drop_duplicates()
# make sure entrez_id is in integer format
mutationData['entrez_id'] = mutationData['entrez_id'].astype(int)

return mutationData


Expand Down