ht-diva · bruno-ariano · Sep 12, 2024 · Sep 9, 2024 · Sep 12, 2024 · Sep 12, 2024
diff --git a/gwasstudio/cli/export.py b/gwasstudio/cli/export.py
@@ -14,52 +14,33 @@
 
 @cloup.command("export", no_args_is_help=True, help=help_doc)
 @cloup.option_group(
-    "Options for Locusbreaker",
-    cloup.option(
-        "--hole_size",
-        default=250000,
-        help="Minimum pair-base distance between SNPs in different loci (default: 250000)",
-    ),
-    cloup.option("--locus-breaker", is_flag=True, default=False, help="Flag to use locus breaker"),
-    cloup.option("--pvalue_sig", default=5, help="pvalue threshold to use for filtering the data"),
-    cloup.option("--pvalue_limit", default=5, help="pvalue threshold to use for filtering the data"),
+    "TileDBVCF options",
+    cloup.option("--uri", default="None", help="TileDB-VCF dataset URI"),
+    cloup.option("--output-path", default="output", help="The name of the output file"),
+    cloup.option("--genome-version", help="Genome version to be used (either hg19 or hg38)", default="hg19"),
+    cloup.option("--columns", default="CHROMOSOME,POSITION,ALLELES,BETA,SE,SAMPLES,LP", help="List of columns to keep, provided as a single string comma separated"),
+    cloup.option("--chromosome", help="Chromosomes list to use during processing. This can be a list of chromosomes separated by comma (Example: 1,2,3,4)", default="1"),
+    cloup.option("--window-size", default=50000000, help="Window size used by tiledbvcf for later queries"),
+    cloup.option("--sample-partitions", help="how many partitions to divide the sample list with for computation (default is 1)", default=1),
+    cloup.option("--region-partitions", help="how many partitions to divide the region list with for computation (default is 20)", default=20)
 )
 @cloup.option_group(
-    "Options for filtering using a list od SNPs ids",
-    cloup.option("--snp_list", is_flag=True, default=False, help="A txt file with a column containing the SNP ids"),
+    "Options for Locusbreaker",
+    cloup.option("--locusbreaker", default=False, is_flag=True, help="Option to run locusbreaker"),
+    cloup.option("--samples", default="None", help="A path of a txt file containing 1 sample name per line"),
+    cloup.option("--pvalue-sig", default=5.0, help="P-value threshold to use for filtering the data"),
+    cloup.option("--pvalue-limit", default=5.0, help="P-value threshold for loci borders"),
+    cloup.option("--hole-size", default=250000, help="Minimum pair-base distance between SNPs in different loci (default: 250000)")
 )
 @cloup.option_group(
-    "TileDBVCF options",
-    cloup.option(
-        "-c", "--columns", default=False, help="List of columns to keep, provided as a single string comma separated"
-    ),
-    # Not yet implemented
-    cloup.option(
-        "--chromosome",
-        help="Chromosomes list to use during processing. This can be a list of chromosomes separated by comma (Example: 1,2,3,4)",
-        default="1",
-    ),
-    cloup.option("-g", "--genome-version", help="Genome version to be used (either hg19 or hg38)", default="hg19"),
-    cloup.option("-o", "--output-path", default="output", help="The name of the output file"),
-    cloup.option(
-        "-r",
-        "--region-partitions",
-        help="how many partition to divide the region list with for computation (default is 20)",
-        default=20,
-    ),
-    cloup.option("-s", "--samples", default=False, help="A path of a txt file containing 1 sample name per line"),
-    cloup.option(
-        "--sample-partitions",
-        help="how many partition to divide the sample list with for computation (default is 1)",
-        default=1,
-    ),
-    cloup.option("-u", "--uri", default=False, help="TileDB-VCF dataset URI"),
-    cloup.option("--window-size", default=50000000, help="Window size used by tiledbvcf for later queries"),
+    "Options for filtering using a list of SNPs ids",
+    cloup.option("--snp-list", default="None", help="A txt file with a column containing the SNP ids")
 )
 @click.pass_context
 def export(
     ctx,
     columns,
+    locusbreaker,
     pvalue_limit,
     pvalue_sig,
     hole_size,
@@ -71,28 +52,27 @@ def export(
     sample_partitions,
     region_partitions,
     samples,
-    uri,
+    uri
 ):
     cfg = ctx.obj["cfg"]
-
+    print(cfg)
     ds = tiledbvcf.Dataset(uri, mode="r", tiledb_config=cfg)
     logger.info("TileDBVCF dataset loaded")
-
     samples_list = []
-    if samples:
-        samples_file = open(samples, "r").read()
-        samples_list = [s.strip() for s in samples_file.split("\n")]
+    if samples != "None":
+        samples_file = open(samples, "r").readlines()
+        samples_list =  [s.split("\n")[0] for s in samples_file]
 
     # Create a mapping of the user selected columns into TileDB
     columns_attribute_mapping = {
         "CHROMOSOME": "contig",
         "POSITION": "pos_start",
         "SNP": "id",
         "ALLELES": "alleles",
-        "BETA": "fmt_BETA",
+        "BETA": "fmt_ES",
         "SE": "fmt_SE",
         "LP": "fmt_LP",
-        "SAMPLES": "sample",
+        "SAMPLES": "sample_name",
     }
     column_list_select = [columns_attribute_mapping[a] for a in columns.split(",")]
     column_list_select.append("pos_end")
@@ -102,42 +82,48 @@ def export(
 
     # Filter bed regions by chromosome if selected
     if chromosome:
-        bed_regions = [
-            r for r in bed_regions_all if any(r.startswith(f"{chr_num}:") for chr_num in chromosome.split(","))
-        ]
+        bed_regions = [r for r in bed_regions_all if any(r.startswith(f"{chr_num}:") for chr_num in chromosome.split(","))]
+
     else:
         bed_regions = bed_regions_all
     logger.info("bed regions created")
 
     # If locus_breaker is selected, run locus_breaker
-    if locus_breaker:
+    if locusbreaker:
+        print("running locus breaker")
         dask_df = ds.map_dask(
-            locus_breaker,
-            attrs=column_list_select,
-            regions=bed_regions,
-            samples=samples_list,
-            sample_partitions=sample_partitions,
-            region_partitions=region_partitions,
+            lambda tiledb_data: locus_breaker(
+            tiledb_data,
             pvalue_limit=pvalue_limit,
             pvalue_sig=pvalue_sig,
             hole_size=hole_size,
-            map_attributes=columns_attribute_mapping,
+            map_attributes=columns_attribute_mapping
+            ),
+            attrs=column_list_select,
+            regions=bed_regions,
+            samples=samples_list,
+            #sample_partitions=sample_partitions,
+            #region_partitions=region_partitions
         )
-        logger.info(f"Saving locus breaker output in {output_path}")
-        dask_df.to_parquet(output_path, engine="pyarrow", compression="snappy")
+        logger.info(f"Saving locus-breaker output in {output_path}")
+        dask_df.to_csv(output_path)
+        return
 
     # If snp_list is selected, run extract_snp
-    if snp_list:
-        tiledb_data_snp = extract_snp(
+    if snp_list != "None":
+        extract_snp(
             tiledb_data=ds,
-            pvalue_file_list=snp_list,
+            snp_file_list=snp_list,
             column_list_select=column_list_select,
             samples_list=samples_list,
             sample_partitions=sample_partitions,
             output_path=output_path,
+            map_attributes=columns_attribute_mapping,
         )
-        logger.info(f"Saving filtered summary statistics by SNPs in {output_path}")
-        tiledb_data_snp.to_parquet(output_path, engine="pyarrow", compression="snappy")
+        logger.info(f"Saved filtered summary statistics by SNPs in {output_path}")
+        exit()
+
+        #tiledb_data_snp.to_parquet(output_path, engine="pyarrow", compression="snappy", schema = None)
 
     # If neither locus_breaker nor snp_list is selected, filter the data by regions and samples
     else:
@@ -146,7 +132,8 @@ def export(
             regions=bed_regions,
             region_partitions=region_partitions,
             samples=samples_list,
-            sample_partitions=sample_partitions,
+            sample_partitions=sample_partitions
         )
         logger.info(f"Saving filtered GWAS by regions and samples in {output_path}")
-        filtered_ddf.to_parquet(output_path, engine="pyarrow", compression="snappy")
+        filtered_ddf.to_parquet(output_path, engine="pyarrow", compression="snappy",schema=None)
+