diff --git a/prompt.txt b/prompt.txt index 9de97f82..8861882f 100644 --- a/prompt.txt +++ b/prompt.txt @@ -107,21 +107,28 @@ class SynSeqPreprocessor: - Records the original dtypes. - Automatically assigns dtypes (date/category/numeric) when not provided. - Converts date columns to datetime and category columns to 'category' dtype. - - For numeric columns, if one value accounts for ≥90% of non-null rows, that value is + - For numeric columns, if one value accounts for ≥90% of non‐null rows, that value is automatically marked as a special value. For each such column, a new categorical column (named base_col_cat) is created: - * If the cell value is missing, it is mapped to "NAN". - * If the cell value equals a detected (or user-specified) special value, it is mapped to that special value (as string). - * Otherwise, it is marked as "NUMERIC". + * If the cell value is missing, it is mapped to the missing marker (here, -999999999). + * If the cell value equals a detected (or user‐specified) special value, it is left as its + original numeric value. + * Otherwise, it is marked with the “numeric” marker (here, -777777777) indicating that the + value is not special. Postprocessing: - Merges back the split (base_col, base_col_cat) columns: For rows where the base column is NaN and the corresponding _cat column - indicates a special value (i.e. not "NUMERIC"), the base column is replaced - with that special value. In particular, if _cat equals "NAN", the base column is set to np.nan. - - Optionally applies user-provided rules sequentially to filter rows. + is not equal to the numeric marker, the base column is replaced + with that special value. In particular, if _cat equals the missing marker, + the base column is set to np.nan. + - Optionally applies user‐provided rules sequentially to filter rows. 
""" + # Define marker constants (choose values unlikely to appear in your data) + NUMERIC_MARKER = -777777777.0 # Indicates a normal (non‐special) value + MISSING_MARKER = -999999999.0 # Indicates a missing value + def __init__( self, user_dtypes: Optional[Dict[str, str]] = None, @@ -131,7 +138,7 @@ class SynSeqPreprocessor: """ Args: user_dtypes: {col: "date"/"category"/"numeric"}, if not provided, auto-detected. - user_special_values: {col: [special_value1, special_value2, ...]}. + user_special_values: {col: [special_value1, special_value2, ...]}. Even if not provided, special values are detected automatically for imbalanced numeric columns. max_categories: When auto-detecting dtypes, if nunique <= max_categories, assign 'category', else 'numeric'. """ @@ -140,8 +147,8 @@ class SynSeqPreprocessor: self.max_categories = max_categories # Internal storage - self.original_dtypes: Dict[str, str] = {} # {col: original_dtype} - self.split_map: Dict[str, str] = {} # {base_col -> cat_col} + self.original_dtypes: Dict[str, str] = {} # {col: original_dtype} + self.split_map: Dict[str, str] = {} # {base_col -> cat_col} self.detected_specials: Dict[str, List[Any]] = {} # stores the special values (detected or user-provided) # ========================================================================= @@ -249,11 +256,11 @@ class SynSeqPreprocessor: def _split_numeric_columns(self, df: pd.DataFrame): """ For each numeric column in user_special_values: - - Create a new categorical column (named base_col_cat) that marks special values. + - Create a new numeric column (named base_col_cat) that marks special values using float markers. - For each cell in the base column: - If NaN -> returns "NAN". - If the value is in the list of special values -> returns that special value (as string). - Otherwise -> returns "NUMERIC". + If NaN -> returns the missing marker (-999999999). + If the value is in the list of special values -> returns that special value. 
+ Otherwise -> returns the numeric marker (-777777777). """ for col, specials in self.user_special_values.items(): if col not in df.columns: @@ -261,24 +268,22 @@ class SynSeqPreprocessor: cat_col = col + "_cat" self.split_map[col] = cat_col - # Store the complete list of special values (detected or user provided) self.detected_specials[col] = specials - # Remove any existing cat_col. if cat_col in df.columns: df.drop(columns=[cat_col], inplace=True) base_idx = df.columns.get_loc(col) df.insert(base_idx, cat_col, None) - def cat_mapper(x, specials, normal_marker="NUMERIC", missing_marker="NAN"): + def cat_mapper(x, specials, normal_marker=self.NUMERIC_MARKER, missing_marker=self.MISSING_MARKER): if pd.isna(x): - return missing_marker + return float(missing_marker) elif x in specials: - return str(x) + return float(x) # retain the special value as-is else: - return normal_marker - df[cat_col] = df[col].apply(lambda x: cat_mapper(x, specials)).astype(str) - df[cat_col] = df[cat_col].astype("category") + return float(normal_marker) + + df[cat_col] = df[col].apply(lambda x: cat_mapper(x, specials)) # ========================================================================= # POSTPROCESSING @@ -288,7 +293,7 @@ class SynSeqPreprocessor: Postprocesses the synthetic DataFrame: 1) Merges back split columns (base_col, base_col_cat) by replacing NaNs in the base column with the corresponding special value (if _cat indicates a special value). - In particular, if _cat equals "NAN", the base column is set to np.nan. + In particular, if _cat equals the missing marker, the base column is set to np.nan. 2) Optionally applies user-provided rules sequentially to filter rows. (Note: Date offset restoration is not performed.) 
""" @@ -303,17 +308,16 @@ class SynSeqPreprocessor: def _merge_splitted_cols(self, df: pd.DataFrame) -> pd.DataFrame: """ For each (base_col, cat_col) pair in split_map: - - If a base column cell is NaN and the corresponding _cat cell is not "NUMERIC", + - If a base column cell is NaN and the corresponding _cat cell is not equal to the numeric marker, then replace the base column cell. - * If the _cat cell is "NAN", set the base column cell to np.nan. - * Otherwise, convert the _cat cell back to its original special value. + * If the _cat cell equals the missing marker, set the base column cell to np.nan. + * Otherwise, set the base column cell to the value in the _cat cell. - Finally, drop the auxiliary _cat column. """ for base_col, cat_col in self.split_map.items(): if base_col in df.columns and cat_col in df.columns: specials = self.detected_specials.get(base_col, []) - # Condition: base column is NaN and _cat is not "NUMERIC" - condition = df[base_col].isna() & (~df[cat_col].isin(["NUMERIC"])) + condition = df[base_col].isna() & (df[cat_col] != self.NUMERIC_MARKER) if condition.any(): df.loc[condition, base_col] = df.loc[condition, cat_col].apply( lambda v: self._convert_special_value(v, specials) @@ -321,16 +325,15 @@ class SynSeqPreprocessor: df.drop(columns=[cat_col], inplace=True) return df - def _convert_special_value(self, val: str, specials: List[Any]) -> Any: + def _convert_special_value(self, val: Any, specials: List[Any]) -> Any: """ - Given the string representation of a special value and the list of original special values, - returns the original special value. In particular, if val equals "NAN", return np.nan. + Given the numeric marker from a _cat column and the list of original special values, + returns the original special value. In particular, if val equals the missing marker, + returns np.nan. 
""" - if val == "NAN": + if val == self.MISSING_MARKER: return np.nan - for special in specials: - if str(special) == val: - return special + # If the value is one of the special values, return it; otherwise, return the value as-is. return val def apply_rules(self, df: pd.DataFrame, rules: Dict[str, List[Tuple[str, str, Any]]]) -> pd.DataFrame: @@ -397,9 +400,9 @@ This module implements a sequential synthesizer (Syn_Seq) that fits each column of the data (following an ordering provided in syn_order) one by one. It supports columns with special values. For such columns, during fitting the model only sees rows with numeric (non‐special) values. At generation time, the -pre‐generated categorical (“_cat”) column (injected by the preprocessor) is used -to decide which rows should be generated by the fitted model (when _cat equals "NUMERIC") -and which rows should directly receive the special value. +pre‐generated special indicator (“_cat”) column (injected by the preprocessor) is +used to decide which rows should be generated by the fitted model (when the _cat +cell equals the numeric marker) and which rows should directly receive the special value. """ from typing import Any, Dict, List, Optional, Tuple @@ -435,6 +438,9 @@ METHOD_MAP: Dict[str, Tuple[Any, Any]] = { "swr": (syn_swr, generate_swr), } +# --- Marker constants (changeable if needed) --- +DEFAULT_NUMERIC_MARKER = -777777777.0 # Indicates that the value is a “normal” (numeric) value +DEFAULT_MISSING_MARKER = -999999999.0 # Indicates that the original cell was missing class Syn_Seq: def __init__( @@ -450,8 +456,11 @@ class Syn_Seq: self.random_state = random_state self.sampling_patience = sampling_patience - # special_values: mapping base column name -> list of special values (e.g. {"capital-gain": [0]}) + # special_values: mapping base column name -> list of special values (e.g. 
{ "capital-gain": [0] }) self.special_values: Dict[str, List[Any]] = {} + # cat_distributions: for each column with a _cat indicator, store its distribution from training. + self.cat_distributions: Dict[str, Dict[Any, float]] = {} + self._model_trained = False self._syn_order: List[str] = [] # synthesis order of columns @@ -459,14 +468,14 @@ class Syn_Seq: self._varsel: Dict[str, List[str]] = {} # predictors for synthesizing each column self._col_models: Dict[str, Dict[str, Any]] = {} # fitted model for each column - # For the first column—and for each numeric column with special values— - # store the observed (real) distribution (filtered to numeric values only) - self._stored_col_data: Dict[str, np.ndarray] = {} + # For the first column, store its observed (real) distribution. + self._first_col_distribution: Dict[str, np.ndarray] = {} def fit_col(self, loader: Any, *args: Any, **kwargs: Any) -> "Syn_Seq": """ Fit each column sequentially using metadata from the loader. - Also extracts special values from any _cat columns. + Also extracts special values from any _cat columns and computes the training + distribution of those columns. 
""" info_dict = loader.info() training_data = loader.dataframe().copy() @@ -478,14 +487,20 @@ class Syn_Seq: self._method_map = info_dict.get("method", {}) self._varsel = info_dict.get("variable_selection", {}) - # --- Extract special values from any _cat columns in the training data --- + # --- Extract special values and _cat column distributions from the training data --- for col in training_data.columns: if col.endswith("_cat"): base_col = col[:-4] - specials = training_data[col].unique() - specials = [v for v in specials if v not in ["NUMERIC"]] + # Extract all unique values except the numeric and missing markers + specials = training_data[col].unique().tolist() + specials = [v for v in specials if v != DEFAULT_NUMERIC_MARKER and v != DEFAULT_MISSING_MARKER] if specials: self.special_values[base_col] = specials + valid = training_data[col].dropna() + if len(valid) > 0: + self.cat_distributions[base_col] = valid.value_counts(normalize=True).to_dict() + else: + self.cat_distributions[base_col] = {DEFAULT_NUMERIC_MARKER: 1.0} # For auto-injected _cat columns, force method "cart" and mirror variable selection. for col in self._syn_order: @@ -500,16 +515,16 @@ class Syn_Seq: print("[INFO] Syn_Seq aggregator: fitting columns...") - # (1) For the first column, store its real (non-null) distribution. + # (1) For the first column, store its observed (non-null) distribution. first_col = self._syn_order[0] - self._stored_col_data[first_col] = training_data[first_col].dropna().values + self._first_col_distribution[first_col] = training_data[first_col].dropna().values - # (2) For columns with special values, store only rows that are NOT special. + # (2) For columns with special values, store only rows that are not special. 
for col, specials in self.special_values.items(): if col not in training_data.columns: continue filtered = training_data[~training_data[col].isin(specials)] - self._stored_col_data[col] = filtered[col].dropna().values + self._first_col_distribution[col] = filtered[col].dropna().values print(f"Fitting '{first_col}' => stored distribution from real data. Done.") @@ -553,9 +568,12 @@ class Syn_Seq: """ Generate `count` rows sequentially. - For columns with special values, the pre-generated _cat column (e.g. "capital-gain_cat") + For columns with special values, the pre‐generated numeric indicator (“_cat”) column is used to decide which rows should have their value generated using the fitted model - (if the _cat cell equals "NUMERIC") and which rows should directly receive the special value. + (if the _cat cell equals the DEFAULT_NUMERIC_MARKER) and which rows should directly + receive the special value. + If the generated _cat column contains no DEFAULT_NUMERIC_MARKER flag, it is re‐sampled + using the stored training distribution. """ if not self._model_trained: raise RuntimeError("Syn_Seq aggregator not yet fitted") @@ -565,10 +583,13 @@ class Syn_Seq: # Initialize a DataFrame with NaN values for all columns. gen_df = pd.DataFrame({col: [np.nan] * count for col in self._syn_order}) - # (1) Generate the first column using its stored real distribution. + # (1) Generate the first column using its stored distribution. 
first_col = self._syn_order[0] - if self._stored_col_data.get(first_col) is not None and len(self._stored_col_data[first_col]) > 0: - gen_df[first_col] = np.random.choice(self._stored_col_data[first_col], size=count, replace=True) + if (first_col in self._first_col_distribution and + len(self._first_col_distribution[first_col]) > 0): + gen_df[first_col] = np.random.choice( + self._first_col_distribution[first_col], size=count, replace=True + ) else: gen_df[first_col] = 0 print(f"Generating '{first_col}' => done.") @@ -576,41 +597,44 @@ class Syn_Seq: # (2) For each subsequent column, generate synthetic values. for col in self._syn_order[1:]: method_name = self._method_map.get(col, "cart") - idx = self._syn_order.index(col) - preds_list = self._varsel.get(col, self._syn_order[:idx]) + preds_list = self._varsel.get(col, self._syn_order[:self._syn_order.index(col)]) - if col in self.special_values: - # For columns with special values, rely on the pre-generated _cat column. - cat_col = col + "_cat" - if cat_col not in gen_df.columns: - raise RuntimeError(f"Expected categorical column {cat_col} not found in generated data.") - - # Check the distribution of the categorical column. - numeric_count = (gen_df[cat_col] == "NUMERIC").sum() + # Check if the column has a corresponding _cat indicator. + cat_col = col + "_cat" + if col in self.special_values and cat_col in gen_df.columns: + # Check if the generated _cat column has any numeric marker. + numeric_count = (gen_df[cat_col] == DEFAULT_NUMERIC_MARKER).sum() if numeric_count == 0: warnings.warn( - f"Degenerate _cat column generated for {col}: no rows marked as 'NUMERIC'. " - "Falling back to random sampling from stored non-special values." + f"Degenerate _cat column for {col}: no rows marked as numeric. Re-sampling _cat column using training distribution." 
) - if col in self._stored_col_data and len(self._stored_col_data[col]) > 0: - fallback_values = np.random.choice( - self._stored_col_data[col], size=count, replace=True + cat_dist = self.cat_distributions.get(col, {DEFAULT_NUMERIC_MARKER: 1.0}) + total = sum(cat_dist.values()) + cat_probs = {k: v / total for k, v in cat_dist.items()} + new_cat = np.where( + np.random.rand(count) < cat_probs.get(DEFAULT_NUMERIC_MARKER, 1.0), + float(DEFAULT_NUMERIC_MARKER), + np.random.choice( + [k for k in cat_probs if k != DEFAULT_NUMERIC_MARKER], + size=count, + p=[cat_probs[k] for k in cat_probs if k != DEFAULT_NUMERIC_MARKER] ) - gen_df[col] = fallback_values - else: - gen_df[col] = 0.0 - else: - # For rows with _cat equal to "NUMERIC", generate synthetic values. - is_numeric = gen_df[cat_col] == "NUMERIC" - if is_numeric.sum() > 0: - Xsyn_numeric = gen_df.loc[is_numeric, preds_list].values - ysyn_numeric = self._generate_single_col(method_name, Xsyn_numeric, col) - gen_df.loc[is_numeric, col] = ysyn_numeric - # For rows where _cat indicates a special value, assign that value directly. - for special in self.special_values[col]: - is_special = gen_df[cat_col] == str(special) - gen_df.loc[is_special, col] = float(special) + ) + gen_df[cat_col] = new_cat + gen_df[cat_col] = gen_df[cat_col].astype(float) + + # For rows with _cat equal to the numeric marker, generate synthetic values. + is_numeric = gen_df[cat_col] == DEFAULT_NUMERIC_MARKER + if is_numeric.sum() > 0: + Xsyn_numeric = gen_df.loc[is_numeric, preds_list].values + ysyn_numeric = self._generate_single_col(method_name, Xsyn_numeric, col) + gen_df.loc[is_numeric, col] = ysyn_numeric + # For rows where _cat equals a special value, assign that special value directly. + for special in self.special_values[col]: + is_special = gen_df[cat_col] == special + gen_df.loc[is_special, col] = special else: + # Otherwise, generate the column normally. 
Xsyn = gen_df[preds_list].values ysyn = self._generate_single_col(method_name, Xsyn, col) gen_df[col] = ysyn @@ -620,19 +644,16 @@ class Syn_Seq: def _generate_single_col(self, method_name: str, Xsyn: np.ndarray, col: str) -> np.ndarray: """ Generate synthetic values for a single column using the fitted model. - If no model is available, falls back to sampling from the stored real distribution. + If no model is available for the column, a RuntimeError is raised. """ if col not in self._col_models or self._col_models[col] is None: - if col in self._stored_col_data and len(self._stored_col_data[col]) > 0: - return np.random.choice(self._stored_col_data[col], size=len(Xsyn), replace=True) - else: - return np.zeros(len(Xsyn)) - + raise RuntimeError(f"No model available for column {col}.") fit_info = self._col_models[col] _, generate_func = METHOD_MAP[fit_info["name"]] return generate_func(fit_info["fitted_model"], Xsyn) + src/synthcity/plugins/core/models/syn_seq/syn_seq_rules.py ** Not a valid file path or file does not exist. ** diff --git a/src/synthcity/plugins/core/models/syn_seq/syn_seq.py b/src/synthcity/plugins/core/models/syn_seq/syn_seq.py index c6f601bc..854da161 100644 --- a/src/synthcity/plugins/core/models/syn_seq/syn_seq.py +++ b/src/synthcity/plugins/core/models/syn_seq/syn_seq.py @@ -5,9 +5,9 @@ of the data (following an ordering provided in syn_order) one by one. It supports columns with special values. For such columns, during fitting the model only sees rows with numeric (non‐special) values. At generation time, the -pre‐generated categorical (“_cat”) column (injected by the preprocessor) is used -to decide which rows should be generated by the fitted model (when _cat equals "NUMERIC") -and which rows should directly receive the special value. 
+pre‐generated special indicator (“_cat”) column (injected by the preprocessor) is +used to decide which rows should be generated by the fitted model (when the _cat +cell equals the numeric marker) and which rows should directly receive the special value. """ from typing import Any, Dict, List, Optional, Tuple @@ -43,6 +43,9 @@ "swr": (syn_swr, generate_swr), } +# --- Marker constants (changeable if needed) --- +DEFAULT_NUMERIC_MARKER = -777777777.0 # Indicates that the value is a “normal” (numeric) value +DEFAULT_MISSING_MARKER = -999999999.0 # Indicates that the original cell was missing class Syn_Seq: def __init__( @@ -58,12 +61,11 @@ def __init__( self.random_state = random_state self.sampling_patience = sampling_patience - # special_values: mapping base column name -> list of special values (e.g. {"capital-gain": [0]}) + # special_values: mapping base column name -> list of special values (e.g. { "capital-gain": [0] }) self.special_values: Dict[str, List[Any]] = {} - # cat_distributions: mapping base column name -> distribution of the _cat column, - # for example: {"capital-gain": {"NUMERIC": 0.7, "0": 0.3}} - self.cat_distributions: Dict[str, Dict[str, float]] = {} - + # cat_distributions: for each column with a _cat indicator, store its distribution from training. + self.cat_distributions: Dict[str, Dict[Any, float]] = {} + self._model_trained = False self._syn_order: List[str] = [] # synthesis order of columns @@ -71,15 +73,14 @@ def __init__( self._varsel: Dict[str, List[str]] = {} # predictors for synthesizing each column self._col_models: Dict[str, Dict[str, Any]] = {} # fitted model for each column - # For the first column—and for each numeric column (or numeric part of a column with special values)— - # store the observed (real) distribution (filtered to numeric values only) + # For the first column, store its observed (real) distribution. 
self._first_col_distribution: Dict[str, np.ndarray] = {} def fit_col(self, loader: Any, *args: Any, **kwargs: Any) -> "Syn_Seq": """ Fit each column sequentially using metadata from the loader. - Also extracts special values from any _cat columns and computes the distribution - (i.e. relative frequencies) from those columns. + Also extracts special values from any _cat columns and computes the training + distribution of those columns. """ info_dict = loader.info() training_data = loader.dataframe().copy() @@ -91,17 +92,20 @@ def fit_col(self, loader: Any, *args: Any, **kwargs: Any) -> "Syn_Seq": self._method_map = info_dict.get("method", {}) self._varsel = info_dict.get("variable_selection", {}) - # --- Extract _cat column info from the training data --- + # --- Extract special values and _cat column distributions from the training data --- for col in training_data.columns: if col.endswith("_cat"): base_col = col[:-4] - # Compute the distribution from the _cat column. - dist = training_data[col].value_counts(normalize=True).to_dict() - self.cat_distributions[base_col] = dist - # Special values are those different from "NUMERIC" - specials = [k for k in dist.keys() if k != "NUMERIC"] + # Extract all unique values except the numeric and missing markers + specials = training_data[col].unique().tolist() + specials = [v for v in specials if v != DEFAULT_NUMERIC_MARKER and v != DEFAULT_MISSING_MARKER] if specials: self.special_values[base_col] = specials + valid = training_data[col].dropna() + if len(valid) > 0: + self.cat_distributions[base_col] = valid.value_counts(normalize=True).to_dict() + else: + self.cat_distributions[base_col] = {DEFAULT_NUMERIC_MARKER: 1.0} # For auto-injected _cat columns, force method "cart" and mirror variable selection. 
for col in self._syn_order: @@ -116,11 +120,11 @@ def fit_col(self, loader: Any, *args: Any, **kwargs: Any) -> "Syn_Seq": print("[INFO] Syn_Seq aggregator: fitting columns...") - # (1) For the first column, store its real (non-null) distribution. + # (1) For the first column, store its observed (non-null) distribution. first_col = self._syn_order[0] self._first_col_distribution[first_col] = training_data[first_col].dropna().values - # (2) For columns with special values, store only rows that are NOT special. + # (2) For columns with special values, store only rows that are not special. for col, specials in self.special_values.items(): if col not in training_data.columns: continue @@ -169,11 +173,12 @@ def generate_col(self, count: int) -> pd.DataFrame: """ Generate `count` rows sequentially. - For columns with special values, the pre‐generated categorical (“_cat”) column (e.g. "capital-gain_cat") + For columns with special values, the pre‐generated numeric indicator (“_cat”) column is used to decide which rows should have their value generated using the fitted model - (if the _cat cell equals "NUMERIC") and which rows should directly receive the special value. - If the generated _cat column contains no "NUMERIC" flag, it is re‐sampled (using the stored distribution) - to preserve the training proportions. + (if the _cat cell equals the DEFAULT_NUMERIC_MARKER) and which rows should directly + receive the special value. + If the generated _cat column contains no DEFAULT_NUMERIC_MARKER flag, it is re‐sampled + using the stored training distribution. """ if not self._model_trained: raise RuntimeError("Syn_Seq aggregator not yet fitted") @@ -183,9 +188,10 @@ def generate_col(self, count: int) -> pd.DataFrame: # Initialize a DataFrame with NaN values for all columns. gen_df = pd.DataFrame({col: [np.nan] * count for col in self._syn_order}) - # (1) Generate the first column using its stored real distribution. 
+ # (1) Generate the first column using its stored distribution. first_col = self._syn_order[0] - if first_col in self._first_col_distribution and len(self._first_col_distribution[first_col]) > 0: + if (first_col in self._first_col_distribution and + len(self._first_col_distribution[first_col]) > 0): gen_df[first_col] = np.random.choice( self._first_col_distribution[first_col], size=count, replace=True ) @@ -196,46 +202,42 @@ def generate_col(self, count: int) -> pd.DataFrame: # (2) For each subsequent column, generate synthetic values. for col in self._syn_order[1:]: method_name = self._method_map.get(col, "cart") - idx = self._syn_order.index(col) - preds_list = self._varsel.get(col, self._syn_order[:idx]) + preds_list = self._varsel.get(col, self._syn_order[:self._syn_order.index(col)]) - # If the column has special values then it has an associated _cat indicator. + # Check if the column has a corresponding _cat indicator. cat_col = col + "_cat" if col in self.special_values and cat_col in gen_df.columns: - # Check if the generated _cat column has any "NUMERIC" flag. - numeric_count = (gen_df[cat_col] == "NUMERIC").sum() + # Check if the generated _cat column has any numeric marker. + numeric_count = (gen_df[cat_col] == DEFAULT_NUMERIC_MARKER).sum() if numeric_count == 0: warnings.warn( - f"Degenerate _cat column for {col}: no rows marked as 'NUMERIC'. " - "Re-sampling _cat column to preserve training distribution." + f"Degenerate _cat column for {col}: no rows marked as numeric. Re-sampling _cat column using training distribution." ) - # Use the stored distribution from training. 
- cat_dist = self.cat_distributions.get(col, {"NUMERIC": 1.0}) + cat_dist = self.cat_distributions.get(col, {DEFAULT_NUMERIC_MARKER: 1.0}) total = sum(cat_dist.values()) - # Normalize probabilities cat_probs = {k: v / total for k, v in cat_dist.items()} - # Re-sample _cat column: new_cat = np.where( - np.random.rand(count) < cat_probs.get("NUMERIC", 1.0), - "NUMERIC", + np.random.rand(count) < cat_probs.get(DEFAULT_NUMERIC_MARKER, 1.0), + float(DEFAULT_NUMERIC_MARKER), np.random.choice( - [k for k in cat_probs.keys() if k != "NUMERIC"], + [k for k in cat_probs if k != DEFAULT_NUMERIC_MARKER], size=count, - p=[cat_probs[k] for k in cat_probs if k != "NUMERIC"] + p=[cat_probs[k] for k in cat_probs if k != DEFAULT_NUMERIC_MARKER] ) ) gen_df[cat_col] = new_cat + gen_df[cat_col] = gen_df[cat_col].astype(float) - # For rows with _cat equal to "NUMERIC", generate synthetic values. - is_numeric = gen_df[cat_col] == "NUMERIC" + # For rows with _cat equal to the numeric marker, generate synthetic values. + is_numeric = gen_df[cat_col] == DEFAULT_NUMERIC_MARKER if is_numeric.sum() > 0: Xsyn_numeric = gen_df.loc[is_numeric, preds_list].values ysyn_numeric = self._generate_single_col(method_name, Xsyn_numeric, col) gen_df.loc[is_numeric, col] = ysyn_numeric - # For rows where _cat indicates a special value, assign that special value directly. + # For rows where _cat equals a special value, assign that special value directly. for special in self.special_values[col]: - is_special = gen_df[cat_col] == str(special) - gen_df.loc[is_special, col] = float(special) + is_special = gen_df[cat_col] == special + gen_df.loc[is_special, col] = special else: # Otherwise, generate the column normally. Xsyn = gen_df[preds_list].values @@ -247,15 +249,11 @@ def generate_col(self, count: int) -> pd.DataFrame: def _generate_single_col(self, method_name: str, Xsyn: np.ndarray, col: str) -> np.ndarray: """ Generate synthetic values for a single column using the fitted model. 
+ If no model is available for the column, a RuntimeError is raised. """ if col not in self._col_models or self._col_models[col] is None: - if col in self._first_col_distribution and len(self._first_col_distribution[col]) > 0: - return np.random.choice( - self._first_col_distribution[col], size=len(Xsyn), replace=True - ) - else: - return np.zeros(len(Xsyn)) - + raise RuntimeError(f"No model available for column {col}.") fit_info = self._col_models[col] _, generate_func = METHOD_MAP[fit_info["name"]] return generate_func(fit_info["fitted_model"], Xsyn) + diff --git a/src/synthcity/plugins/core/models/syn_seq/syn_seq_preprocess.py b/src/synthcity/plugins/core/models/syn_seq/syn_seq_preprocess.py index d7b6cd1e..0ac867b7 100644 --- a/src/synthcity/plugins/core/models/syn_seq/syn_seq_preprocess.py +++ b/src/synthcity/plugins/core/models/syn_seq/syn_seq_preprocess.py @@ -11,21 +11,28 @@ class SynSeqPreprocessor: - Records the original dtypes. - Automatically assigns dtypes (date/category/numeric) when not provided. - Converts date columns to datetime and category columns to 'category' dtype. - - For numeric columns, if one value accounts for ≥90% of non-null rows, that value is + - For numeric columns, if one value accounts for ≥90% of non‐null rows, that value is automatically marked as a special value. For each such column, a new categorical column (named base_col_cat) is created: - * If the cell value is missing, it is mapped to "NAN". - * If the cell value equals a detected (or user-specified) special value, it is mapped to that special value (as string). - * Otherwise, it is marked as "NUMERIC". + * If the cell value is missing, it is mapped to the missing marker (here, -999999999). + * If the cell value equals a detected (or user‐specified) special value, it is left as its + original numeric value. + * Otherwise, it is marked with the “numeric” marker (here, -777777777) indicating that the + value is not special. 
Postprocessing: - Merges back the split (base_col, base_col_cat) columns: For rows where the base column is NaN and the corresponding _cat column - indicates a special value (i.e. not "NUMERIC"), the base column is replaced - with that special value. In particular, if _cat equals "NAN", the base column is set to np.nan. - - Optionally applies user-provided rules sequentially to filter rows. + is not equal to the numeric marker, the base column is replaced + with that special value. In particular, if _cat equals the missing marker, + the base column is set to np.nan. + - Optionally applies user‐provided rules sequentially to filter rows. """ + # Define marker constants (choose values unlikely to appear in your data) + NUMERIC_MARKER = -777777777.0 # Indicates a normal (non‐special) value + MISSING_MARKER = -999999999.0 # Indicates a missing value + def __init__( self, user_dtypes: Optional[Dict[str, str]] = None, @@ -35,7 +42,7 @@ def __init__( """ Args: user_dtypes: {col: "date"/"category"/"numeric"}, if not provided, auto-detected. - user_special_values: {col: [special_value1, special_value2, ...]}. + user_special_values: {col: [special_value1, special_value2, ...]}. Even if not provided, special values are detected automatically for imbalanced numeric columns. max_categories: When auto-detecting dtypes, if nunique <= max_categories, assign 'category', else 'numeric'. 
""" @@ -44,8 +51,8 @@ def __init__( self.max_categories = max_categories # Internal storage - self.original_dtypes: Dict[str, str] = {} # {col: original_dtype} - self.split_map: Dict[str, str] = {} # {base_col -> cat_col} + self.original_dtypes: Dict[str, str] = {} # {col: original_dtype} + self.split_map: Dict[str, str] = {} # {base_col -> cat_col} self.detected_specials: Dict[str, List[Any]] = {} # stores the special values (detected or user-provided) # ========================================================================= @@ -153,11 +160,11 @@ def _detect_special_values(self, df: pd.DataFrame): def _split_numeric_columns(self, df: pd.DataFrame): """ For each numeric column in user_special_values: - - Create a new categorical column (named base_col_cat) that marks special values. + - Create a new numeric column (named base_col_cat) that marks special values using float markers. - For each cell in the base column: - If NaN -> returns "NAN". - If the value is in the list of special values -> returns that special value (as string). - Otherwise -> returns "NUMERIC". + If NaN -> returns the missing marker (-999999999). + If the value is in the list of special values -> returns that special value. + Otherwise -> returns the numeric marker (-777777777). """ for col, specials in self.user_special_values.items(): if col not in df.columns: @@ -165,24 +172,22 @@ def _split_numeric_columns(self, df: pd.DataFrame): cat_col = col + "_cat" self.split_map[col] = cat_col - # Store the complete list of special values (detected or user provided) self.detected_specials[col] = specials - # Remove any existing cat_col. 
if cat_col in df.columns: df.drop(columns=[cat_col], inplace=True) base_idx = df.columns.get_loc(col) df.insert(base_idx, cat_col, None) - def cat_mapper(x, specials, normal_marker="NUMERIC", missing_marker="NAN"): + def cat_mapper(x, specials, normal_marker=self.NUMERIC_MARKER, missing_marker=self.MISSING_MARKER): if pd.isna(x): - return missing_marker + return float(missing_marker) elif x in specials: - return str(x) + return float(x) # retain the special value as-is else: - return normal_marker - df[cat_col] = df[col].apply(lambda x: cat_mapper(x, specials)).astype(str) - df[cat_col] = df[cat_col].astype("category") + return float(normal_marker) + + df[cat_col] = df[col].apply(lambda x: cat_mapper(x, specials)) # ========================================================================= # POSTPROCESSING @@ -192,7 +197,7 @@ def postprocess(self, df: pd.DataFrame, rules: Optional[Dict[str, List[Tuple[str Postprocesses the synthetic DataFrame: 1) Merges back split columns (base_col, base_col_cat) by replacing NaNs in the base column with the corresponding special value (if _cat indicates a special value). - In particular, if _cat equals "NAN", the base column is set to np.nan. + In particular, if _cat equals the missing marker, the base column is set to np.nan. 2) Optionally applies user-provided rules sequentially to filter rows. (Note: Date offset restoration is not performed.) """ @@ -207,17 +212,16 @@ def postprocess(self, df: pd.DataFrame, rules: Optional[Dict[str, List[Tuple[str def _merge_splitted_cols(self, df: pd.DataFrame) -> pd.DataFrame: """ For each (base_col, cat_col) pair in split_map: - - If a base column cell is NaN and the corresponding _cat cell is not "NUMERIC", + - If a base column cell is NaN and the corresponding _cat cell is not equal to the numeric marker, then replace the base column cell. - * If the _cat cell is "NAN", set the base column cell to np.nan. - * Otherwise, convert the _cat cell back to its original special value. 
+ * If the _cat cell equals the missing marker, set the base column cell to np.nan. + * Otherwise, set the base column cell to the value in the _cat cell. - Finally, drop the auxiliary _cat column. """ for base_col, cat_col in self.split_map.items(): if base_col in df.columns and cat_col in df.columns: specials = self.detected_specials.get(base_col, []) - # Condition: base column is NaN and _cat is not "NUMERIC" - condition = df[base_col].isna() & (~df[cat_col].isin(["NUMERIC"])) + condition = df[base_col].isna() & (df[cat_col] != self.NUMERIC_MARKER) if condition.any(): df.loc[condition, base_col] = df.loc[condition, cat_col].apply( lambda v: self._convert_special_value(v, specials) @@ -225,16 +229,15 @@ def _merge_splitted_cols(self, df: pd.DataFrame) -> pd.DataFrame: df.drop(columns=[cat_col], inplace=True) return df - def _convert_special_value(self, val: str, specials: List[Any]) -> Any: + def _convert_special_value(self, val: Any, specials: List[Any]) -> Any: """ - Given the string representation of a special value and the list of original special values, - returns the original special value. In particular, if val equals "NAN", return np.nan. + Given the numeric marker from a _cat column and the list of original special values, + returns the original special value. In particular, if val equals the missing marker, + returns np.nan. """ - if val == "NAN": + if val == self.MISSING_MARKER: return np.nan - for special in specials: - if str(special) == val: - return special + # If the value is one of the special values, return it; otherwise, return the value as-is. 
return val def apply_rules(self, df: pd.DataFrame, rules: Dict[str, List[Tuple[str, str, Any]]]) -> pd.DataFrame: diff --git a/tutorials/tutorial10_sequential_synthesis.ipynb b/tutorials/tutorial10_sequential_synthesis.ipynb index 91a58c3e..8d62e259 100644 --- a/tutorials/tutorial10_sequential_synthesis.ipynb +++ b/tutorials/tutorial10_sequential_synthesis.ipynb @@ -24,9 +24,188 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: synthcity in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (0.2.11)\n", + "Requirement already satisfied: importlib-metadata in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (8.5.0)\n", + "Requirement already satisfied: pandas>=2.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (2.2.3)\n", + "Requirement already satisfied: torch<2.3,>=2.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (2.2.2)\n", + "Requirement already satisfied: scikit-learn>=1.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (1.6.0)\n", + "Requirement already satisfied: nflows>=0.14 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.14)\n", + "Requirement already satisfied: numpy<2.0,>=1.20 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (1.26.4)\n", + "Requirement already satisfied: lifelines<0.30.0,>=0.29.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.29.0)\n", + "Requirement already satisfied: opacus>=1.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (1.5.2)\n", + "Requirement already satisfied: networkx<3.0,>2.0 in 
/Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (2.8.8)\n", + "Requirement already satisfied: decaf-synthetic-data>=0.1.6 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.1.6)\n", + "Requirement already satisfied: optuna>=3.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (4.1.0)\n", + "Requirement already satisfied: shap in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.46.0)\n", + "Requirement already satisfied: tenacity in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (9.0.0)\n", + "Requirement already satisfied: tqdm in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (4.67.1)\n", + "Requirement already satisfied: loguru in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.7.3)\n", + "Requirement already satisfied: pydantic>=2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (2.10.4)\n", + "Requirement already satisfied: cloudpickle in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (3.1.0)\n", + "Requirement already satisfied: scipy in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (1.13.1)\n", + "Requirement already satisfied: xgboost<3.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (2.1.3)\n", + "Requirement already satisfied: geomloss in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.2.6)\n", + "Requirement already satisfied: pgmpy in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.1.26)\n", + "Requirement already satisfied: redis in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) 
(5.2.1)\n", + "Requirement already satisfied: pycox in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.3.0)\n", + "Requirement already satisfied: xgbse>=0.3.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.3.3)\n", + "Requirement already satisfied: pykeops in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (2.2.3)\n", + "Requirement already satisfied: fflows in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.0.3)\n", + "Requirement already satisfied: monai in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (1.4.0)\n", + "Requirement already satisfied: tsai in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.3.9)\n", + "Requirement already satisfied: be-great>=0.0.5 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.0.8)\n", + "Requirement already satisfied: arfpy in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from synthcity) (0.1.1)\n", + "Requirement already satisfied: datasets>=2.5.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from be-great>=0.0.5->synthcity) (3.2.0)\n", + "Requirement already satisfied: transformers>=4.22.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from be-great>=0.0.5->synthcity) (4.47.1)\n", + "Requirement already satisfied: accelerate>=0.20.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from be-great>=0.0.5->synthcity) (1.2.1)\n", + "Requirement already satisfied: pytorch-lightning<2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from decaf-synthetic-data>=0.1.6->synthcity) (1.9.5)\n", + "Requirement already satisfied: torchtext>=0.10 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from 
decaf-synthetic-data>=0.1.6->synthcity) (0.17.2)\n", + "Requirement already satisfied: matplotlib>=3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from lifelines<0.30.0,>=0.29.0->synthcity) (3.9.4)\n", + "Requirement already satisfied: autograd>=1.5 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from lifelines<0.30.0,>=0.29.0->synthcity) (1.7.0)\n", + "Requirement already satisfied: autograd-gamma>=0.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from lifelines<0.30.0,>=0.29.0->synthcity) (0.5.0)\n", + "Requirement already satisfied: formulaic>=0.2.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from lifelines<0.30.0,>=0.29.0->synthcity) (1.1.1)\n", + "Requirement already satisfied: tensorboard in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from nflows>=0.14->synthcity) (2.18.0)\n", + "Requirement already satisfied: opt-einsum>=3.3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from opacus>=1.3->synthcity) (3.4.0)\n", + "Requirement already satisfied: alembic>=1.5.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from optuna>=3.1->synthcity) (1.14.0)\n", + "Requirement already satisfied: colorlog in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from optuna>=3.1->synthcity) (6.9.0)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from optuna>=3.1->synthcity) (24.2)\n", + "Requirement already satisfied: sqlalchemy>=1.4.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from optuna>=3.1->synthcity) (2.0.36)\n", + "Requirement already satisfied: PyYAML in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from optuna>=3.1->synthcity) (6.0.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in 
/Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pandas>=2.1->synthcity) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pandas>=2.1->synthcity) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pandas>=2.1->synthcity) (2024.2)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pydantic>=2.0->synthcity) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.27.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pydantic>=2.0->synthcity) (2.27.2)\n", + "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pydantic>=2.0->synthcity) (4.12.2)\n", + "Requirement already satisfied: joblib>=1.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from scikit-learn>=1.2->synthcity) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from scikit-learn>=1.2->synthcity) (3.5.0)\n", + "Requirement already satisfied: filelock in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from torch<2.3,>=2.1->synthcity) (3.16.1)\n", + "Requirement already satisfied: sympy in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from torch<2.3,>=2.1->synthcity) (1.13.3)\n", + "Requirement already satisfied: jinja2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from torch<2.3,>=2.1->synthcity) (3.1.5)\n", + "Requirement already satisfied: fsspec in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from torch<2.3,>=2.1->synthcity) (2024.9.0)\n", + "Requirement already satisfied: 
zipp>=3.20 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from importlib-metadata->synthcity) (3.21.0)\n", + "Requirement already satisfied: pyparsing in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pgmpy->synthcity) (3.2.1)\n", + "Requirement already satisfied: statsmodels in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pgmpy->synthcity) (0.14.4)\n", + "Requirement already satisfied: google-generativeai in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pgmpy->synthcity) (0.8.3)\n", + "Requirement already satisfied: torchtuples>=0.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pycox->synthcity) (0.2.2)\n", + "Requirement already satisfied: feather-format>=0.4.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pycox->synthcity) (0.4.1)\n", + "Requirement already satisfied: h5py>=2.9.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pycox->synthcity) (3.12.1)\n", + "Requirement already satisfied: numba>=0.44 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pycox->synthcity) (0.60.0)\n", + "Requirement already satisfied: requests>=2.22.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pycox->synthcity) (2.32.3)\n", + "Requirement already satisfied: py7zr>=0.11.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pycox->synthcity) (0.22.0)\n", + "Requirement already satisfied: pybind11 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pykeops->synthcity) (2.13.6)\n", + "Requirement already satisfied: keopscore==2.2.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pykeops->synthcity) (2.2.3)\n", + "Requirement already satisfied: async-timeout>=4.0.3 in 
/Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from redis->synthcity) (5.0.1)\n", + "Requirement already satisfied: slicer==0.0.8 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from shap->synthcity) (0.0.8)\n", + "Requirement already satisfied: fastai>=2.7.14 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tsai->synthcity) (2.7.18)\n", + "Requirement already satisfied: pyts>=0.12.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tsai->synthcity) (0.13.0)\n", + "Requirement already satisfied: imbalanced-learn>=0.11.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tsai->synthcity) (0.12.4)\n", + "Requirement already satisfied: psutil>=5.4.8 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tsai->synthcity) (5.8.0)\n", + "Requirement already satisfied: huggingface-hub>=0.21.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from accelerate>=0.20.1->be-great>=0.0.5->synthcity) (0.27.0)\n", + "Requirement already satisfied: safetensors>=0.4.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from accelerate>=0.20.1->be-great>=0.0.5->synthcity) (0.5.0)\n", + "Requirement already satisfied: Mako in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from alembic>=1.5.0->optuna>=3.1->synthcity) (1.3.8)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from datasets>=2.5.2->be-great>=0.0.5->synthcity) (18.1.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from datasets>=2.5.2->be-great>=0.0.5->synthcity) (0.3.8)\n", + "Requirement already satisfied: xxhash in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from 
datasets>=2.5.2->be-great>=0.0.5->synthcity) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from datasets>=2.5.2->be-great>=0.0.5->synthcity) (0.70.16)\n", + "Requirement already satisfied: aiohttp in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from datasets>=2.5.2->be-great>=0.0.5->synthcity) (3.11.11)\n", + "Requirement already satisfied: pip in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from fastai>=2.7.14->tsai->synthcity) (24.2)\n", + "Requirement already satisfied: fastdownload<2,>=0.0.5 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from fastai>=2.7.14->tsai->synthcity) (0.0.7)\n", + "Requirement already satisfied: fastcore<1.8,>=1.5.29 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from fastai>=2.7.14->tsai->synthcity) (1.7.28)\n", + "Requirement already satisfied: torchvision>=0.11 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from fastai>=2.7.14->tsai->synthcity) (0.17.2)\n", + "Requirement already satisfied: fastprogress>=0.2.4 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from fastai>=2.7.14->tsai->synthcity) (1.0.3)\n", + "Requirement already satisfied: pillow>=9.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from fastai>=2.7.14->tsai->synthcity) (11.1.0)\n", + "Requirement already satisfied: spacy<4 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from fastai>=2.7.14->tsai->synthcity) (3.8.3)\n", + "Requirement already satisfied: interface-meta>=1.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from formulaic>=0.2.2->lifelines<0.30.0,>=0.29.0->synthcity) (1.3.0)\n", + "Requirement already satisfied: wrapt>=1.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from 
formulaic>=0.2.2->lifelines<0.30.0,>=0.29.0->synthcity) (1.17.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from matplotlib>=3.0->lifelines<0.30.0,>=0.29.0->synthcity) (1.3.0)\n", + "Requirement already satisfied: cycler>=0.10 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from matplotlib>=3.0->lifelines<0.30.0,>=0.29.0->synthcity) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from matplotlib>=3.0->lifelines<0.30.0,>=0.29.0->synthcity) (4.55.3)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from matplotlib>=3.0->lifelines<0.30.0,>=0.29.0->synthcity) (1.4.7)\n", + "Requirement already satisfied: importlib-resources>=3.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from matplotlib>=3.0->lifelines<0.30.0,>=0.29.0->synthcity) (6.5.2)\n", + "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from numba>=0.44->pycox->synthcity) (0.43.0)\n", + "Requirement already satisfied: texttable in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (1.7.0)\n", + "Requirement already satisfied: pycryptodomex>=3.16.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (3.21.0)\n", + "Requirement already satisfied: pyzstd>=0.15.9 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (0.16.2)\n", + "Requirement already satisfied: pyppmd<1.2.0,>=1.1.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (1.1.1)\n", + "Requirement already satisfied: 
pybcj<1.1.0,>=1.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (1.0.3)\n", + "Requirement already satisfied: multivolumefile>=0.2.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (0.2.3)\n", + "Requirement already satisfied: inflate64<1.1.0,>=1.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (1.0.1)\n", + "Requirement already satisfied: brotli>=1.1.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from py7zr>=0.11.3->pycox->synthcity) (1.1.0)\n", + "Requirement already satisfied: six>=1.5 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas>=2.1->synthcity) (1.17.0)\n", + "Requirement already satisfied: torchmetrics>=0.7.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pytorch-lightning<2.0->decaf-synthetic-data>=0.1.6->synthcity) (1.6.1)\n", + "Requirement already satisfied: lightning-utilities>=0.6.0.post0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pytorch-lightning<2.0->decaf-synthetic-data>=0.1.6->synthcity) (0.11.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from requests>=2.22.0->pycox->synthcity) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from requests>=2.22.0->pycox->synthcity) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from requests>=2.22.0->pycox->synthcity) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from requests>=2.22.0->pycox->synthcity) 
(2024.12.14)\n", + "Requirement already satisfied: regex!=2019.12.17 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from transformers>=4.22.1->be-great>=0.0.5->synthcity) (2024.11.6)\n", + "Requirement already satisfied: tokenizers<0.22,>=0.21 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from transformers>=4.22.1->be-great>=0.0.5->synthcity) (0.21.0)\n", + "Requirement already satisfied: google-ai-generativelanguage==0.6.10 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-generativeai->pgmpy->synthcity) (0.6.10)\n", + "Requirement already satisfied: google-api-core in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-generativeai->pgmpy->synthcity) (2.24.0)\n", + "Requirement already satisfied: google-api-python-client in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-generativeai->pgmpy->synthcity) (2.157.0)\n", + "Requirement already satisfied: google-auth>=2.15.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-generativeai->pgmpy->synthcity) (2.37.0)\n", + "Requirement already satisfied: protobuf in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-generativeai->pgmpy->synthcity) (5.29.2)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-ai-generativelanguage==0.6.10->google-generativeai->pgmpy->synthcity) (1.25.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from jinja2->torch<2.3,>=2.1->synthcity) (3.0.2)\n", + "Requirement already satisfied: patsy>=0.5.6 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from statsmodels->pgmpy->synthcity) (1.0.1)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in 
/Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from sympy->torch<2.3,>=2.1->synthcity) (1.3.0)\n", + "Requirement already satisfied: absl-py>=0.4 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tensorboard->nflows>=0.14->synthcity) (2.1.0)\n", + "Requirement already satisfied: grpcio>=1.48.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tensorboard->nflows>=0.14->synthcity) (1.69.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tensorboard->nflows>=0.14->synthcity) (3.7)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tensorboard->nflows>=0.14->synthcity) (75.1.0)\n", + "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tensorboard->nflows>=0.14->synthcity) (0.7.2)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from tensorboard->nflows>=0.14->synthcity) (3.1.3)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from aiohttp->datasets>=2.5.2->be-great>=0.0.5->synthcity) (2.4.4)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from aiohttp->datasets>=2.5.2->be-great>=0.0.5->synthcity) (1.3.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from aiohttp->datasets>=2.5.2->be-great>=0.0.5->synthcity) (24.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from aiohttp->datasets>=2.5.2->be-great>=0.0.5->synthcity) 
(1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from aiohttp->datasets>=2.5.2->be-great>=0.0.5->synthcity) (6.1.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from aiohttp->datasets>=2.5.2->be-great>=0.0.5->synthcity) (0.2.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from aiohttp->datasets>=2.5.2->be-great>=0.0.5->synthcity) (1.18.3)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-api-core->google-generativeai->pgmpy->synthcity) (1.66.0)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-auth>=2.15.0->google-generativeai->pgmpy->synthcity) (5.5.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-auth>=2.15.0->google-generativeai->pgmpy->synthcity) (0.4.1)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-auth>=2.15.0->google-generativeai->pgmpy->synthcity) (4.9)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from 
spacy<4->fastai>=2.7.14->tsai->synthcity) (1.0.11)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (2.0.10)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (3.0.9)\n", + "Requirement already satisfied: thinc<8.4.0,>=8.3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (8.3.3)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (1.1.3)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (2.5.0)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (2.0.10)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (0.4.1)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (0.15.1)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from spacy<4->fastai>=2.7.14->tsai->synthcity) (3.5.0)\n", + "Requirement already satisfied: httplib2<1.dev0,>=0.19.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-api-python-client->google-generativeai->pgmpy->synthcity) (0.22.0)\n", + "Requirement already 
satisfied: google-auth-httplib2<1.0.0,>=0.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-api-python-client->google-generativeai->pgmpy->synthcity) (0.2.0)\n", + "Requirement already satisfied: uritemplate<5,>=3.0.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-api-python-client->google-generativeai->pgmpy->synthcity) (4.1.1)\n", + "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-ai-generativelanguage==0.6.10->google-generativeai->pgmpy->synthcity) (1.69.0)\n", + "Requirement already satisfied: language-data>=1.2 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (1.3.0)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth>=2.15.0->google-generativeai->pgmpy->synthcity) (0.6.1)\n", + "Requirement already satisfied: blis<1.2.0,>=1.1.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from thinc<8.4.0,>=8.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (1.1.0)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from thinc<8.4.0,>=8.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (0.1.5)\n", + "Requirement already satisfied: click>=8.0.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from typer<1.0.0,>=0.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (8.1.8)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from 
typer<1.0.0,>=0.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (1.5.4)\n", + "Requirement already satisfied: rich>=10.11.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from typer<1.0.0,>=0.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (13.9.4)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from weasel<0.5.0,>=0.1.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (0.20.0)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from weasel<0.5.0,>=0.1.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (7.1.0)\n", + "Requirement already satisfied: marisa-trie>=1.1.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (1.2.1)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (2.18.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/minkeychang/anaconda3/envs/syn_seq/lib/python3.9/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<4->fastai>=2.7.14->tsai->synthcity) (0.1.2)\n" + ] + } + ], "source": [ "!pip install synthcity" ] @@ -519,8 +698,8 @@ "data": { "text/plain": [ "capital-gain_cat\n", - "0 44888\n", - "NUMERIC 3954\n", + " 0.0 44888\n", + "-777777777.0 3954\n", "Name: count, dtype: int64" ] }, @@ -542,8 +721,8 @@ "data": { "text/plain": [ "capital-loss_cat\n", - "0 46560\n", - "NUMERIC 2282\n", + " 0.0 46560\n", + "-777777777.0 
2282\n", "Name: count, dtype: int64" ] }, @@ -806,9 +985,9 @@ "
\n", - " | age | \n", - "sex | \n", - "workclass | \n", - "education-num | \n", - "marital-status | \n", - "occupation | \n", - "relationship | \n", - "fnlwgt | \n", - "race | \n", - "capital-loss_cat | \n", - "capital-loss | \n", - "hours-per-week | \n", - "native-country | \n", - "income>50K | \n", - "capital-gain_cat | \n", - "capital-gain | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "9 | \n", - "1 | \n", - "0 | \n", - "4 | \n", - "0 | \n", - "6 | \n", - "2 | \n", - "22 | \n", - "0 | \n", - "0 | \n", - "50 | \n", - "39 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "7 | \n", - "
1 | \n", - "31 | \n", - "1 | \n", - "0 | \n", - "12 | \n", - "0 | \n", - "3 | \n", - "2 | \n", - "7 | \n", - "0 | \n", - "0 | \n", - "48 | \n", - "39 | \n", - "0 | \n", - "1 | \n", - "0 | \n", - "2 | \n", - "
2 | \n", - "25 | \n", - "0 | \n", - "0 | \n", - "8 | \n", - "0 | \n", - "8 | \n", - "0 | \n", - "12 | \n", - "0 | \n", - "0 | \n", - "39 | \n", - "59 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "4 | \n", - "
3 | \n", - "45 | \n", - "1 | \n", - "8 | \n", - "8 | \n", - "0 | \n", - "14 | \n", - "2 | \n", - "11 | \n", - "0 | \n", - "0 | \n", - "37 | \n", - "39 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "4 | \n", - "
4 | \n", - "49 | \n", - "0 | \n", - "8 | \n", - "8 | \n", - "4 | \n", - "14 | \n", - "3 | \n", - "8 | \n", - "4 | \n", - "0 | \n", - "34 | \n", - "39 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "2 | \n", - "
5 | \n", - "40 | \n", - "1 | \n", - "2 | \n", - "12 | \n", - "0 | \n", - "3 | \n", - "2 | \n", - "13 | \n", - "0 | \n", - "0 | \n", - "38 | \n", - "49 | \n", - "0 | \n", - "1 | \n", - "0 | \n", - "7 | \n", - "
6 | \n", - "18 | \n", - "1 | \n", - "0 | \n", - "12 | \n", - "0 | \n", - "5 | \n", - "2 | \n", - "6 | \n", - "0 | \n", - "0 | \n", - "46 | \n", - "49 | \n", - "0 | \n", - "1 | \n", - "0 | \n", - "4 | \n", - "
7 | \n", - "22 | \n", - "0 | \n", - "0 | \n", - "8 | \n", - "0 | \n", - "4 | \n", - "0 | \n", - "11 | \n", - "0 | \n", - "0 | \n", - "38 | \n", - "54 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "3 | \n", - "
8 | \n", - "1 | \n", - "0 | \n", - "0 | \n", - "5 | \n", - "2 | \n", - "3 | \n", - "1 | \n", - "8 | \n", - "0 | \n", - "0 | \n", - "48 | \n", - "14 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "15 | \n", - "
9 | \n", - "27 | \n", - "1 | \n", - "0 | \n", - "9 | \n", - "0 | \n", - "1 | \n", - "2 | \n", - "2 | \n", - "0 | \n", - "0 | \n", - "32 | \n", - "49 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "8 | \n", - "
10 | \n", - "7 | \n", - "1 | \n", - "0 | \n", - "9 | \n", - "2 | \n", - "6 | \n", - "1 | \n", - "12 | \n", - "0 | \n", - "0 | \n", - "38 | \n", - "19 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "2 | \n", - "
11 | \n", - "8 | \n", - "0 | \n", - "5 | \n", - "13 | \n", - "2 | \n", - "5 | \n", - "3 | \n", - "9 | \n", - "0 | \n", - "0 | \n", - "33 | \n", - "19 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "7 | \n", - "
12 | \n", - "1 | \n", - "1 | \n", - "0 | \n", - "8 | \n", - "2 | \n", - "2 | \n", - "4 | \n", - "20 | \n", - "0 | \n", - "0 | \n", - "32 | \n", - "19 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "10 | \n", - "
13 | \n", - "6 | \n", - "0 | \n", - "0 | \n", - "11 | \n", - "2 | \n", - "8 | \n", - "3 | \n", - "12 | \n", - "0 | \n", - "0 | \n", - "33 | \n", - "14 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "7 | \n", - "
14 | \n", - "21 | \n", - "0 | \n", - "5 | \n", - "13 | \n", - "1 | \n", - "5 | \n", - "5 | \n", - "7 | \n", - "0 | \n", - "0 | \n", - "45 | \n", - "39 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "7 | \n", - "
15 | \n", - "5 | \n", - "0 | \n", - "0 | \n", - "8 | \n", - "0 | \n", - "7 | \n", - "0 | \n", - "22 | \n", - "4 | \n", - "0 | \n", - "31 | \n", - "36 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "15 | \n", - "
16 | \n", - "31 | \n", - "1 | \n", - "0 | \n", - "8 | \n", - "0 | \n", - "1 | \n", - "2 | \n", - "4 | \n", - "0 | \n", - "0 | \n", - "34 | \n", - "39 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "15 | \n", - "
17 | \n", - "8 | \n", - "1 | \n", - "0 | \n", - "8 | \n", - "0 | \n", - "7 | \n", - "2 | \n", - "10 | \n", - "0 | \n", - "0 | \n", - "21 | \n", - "41 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "3 | \n", - "
18 | \n", - "22 | \n", - "1 | \n", - "0 | \n", - "6 | \n", - "1 | \n", - "7 | \n", - "3 | \n", - "5 | \n", - "0 | \n", - "0 | \n", - "39 | \n", - "39 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "15 | \n", - "
19 | \n", - "5 | \n", - "0 | \n", - "5 | \n", - "9 | \n", - "2 | \n", - "8 | \n", - "1 | \n", - "19 | \n", - "0 | \n", - "0 | \n", - "32 | \n", - "14 | \n", - "0 | \n", - "0 | \n", - "0 | \n", - "4 | \n", - "