diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..2ffe63b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,49 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --dev + + - name: Run tests + run: uv run pytest tests/ -v --tb=short + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + + - name: Set up Python + run: uv python install 3.12 + + - name: Install dependencies + run: uv sync --dev + + - name: Run ruff check + run: uv run ruff check . + + - name: Run ruff format check + run: uv run ruff format --check . diff --git a/examples/reports/train_hashprep_report_fixes.py b/examples/reports/train_hashprep_report_fixes.py index 235c9ab..61e21a6 100644 --- a/examples/reports/train_hashprep_report_fixes.py +++ b/examples/reports/train_hashprep_report_fixes.py @@ -3,9 +3,6 @@ Review and adapt before production use. """ -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -from sklearn.preprocessing import RobustScaler -import numpy as np import pandas as pd @@ -14,51 +11,51 @@ def apply_fixes(df): df = df.copy() # Column 'Cabin' has 77% missing values - df = df.drop(columns=['Cabin']) + df = df.drop(columns=["Cabin"]) # Frequency encode high-cardinality column 'Name' - freq_Name = df['Name'].value_counts(normalize=True) - df['Name_encoded'] = df['Name'].map(freq_Name) + freq_Name = df["Name"].value_counts(normalize=True) + df["Name_encoded"] = df["Name"].map(freq_Name) # Frequency encode high-cardinality column 'Ticket' - freq_Ticket = df['Ticket'].value_counts(normalize=True) - df['Ticket_encoded'] = df['Ticket'].map(freq_Ticket) + freq_Ticket = df["Ticket"].value_counts(normalize=True) + df["Ticket_encoded"] = df["Ticket"].map(freq_Ticket) # Clip outliers in 'Fare' using IQR method - q1_Fare, q3_Fare = df['Fare'].quantile([0.25, 0.75]) + q1_Fare, q3_Fare = df["Fare"].quantile([0.25, 0.75]) iqr_Fare = q3_Fare - q1_Fare lower_Fare, upper_Fare = q1_Fare - 1.5 * iqr_Fare, q3_Fare + 1.5 * iqr_Fare - df['Fare'] = df['Fare'].clip(lower=lower_Fare, upper=upper_Fare) + df["Fare"] = df["Fare"].clip(lower=lower_Fare, upper=upper_Fare) # Clip outliers in 'Parch' using IQR method - q1_Parch, q3_Parch = df['Parch'].quantile([0.25, 0.75]) + q1_Parch, q3_Parch = df["Parch"].quantile([0.25, 0.75]) iqr_Parch = q3_Parch - q1_Parch lower_Parch, upper_Parch = q1_Parch - 1.5 * iqr_Parch, q3_Parch + 1.5 * iqr_Parch - df['Parch'] = df['Parch'].clip(lower=lower_Parch, upper=upper_Parch) + df["Parch"] = df["Parch"].clip(lower=lower_Parch, upper=upper_Parch) # Clip outliers in 'SibSp' using IQR method - q1_SibSp, q3_SibSp = df['SibSp'].quantile([0.25, 0.75]) + q1_SibSp, q3_SibSp = df["SibSp"].quantile([0.25, 0.75]) iqr_SibSp = q3_SibSp - q1_SibSp lower_SibSp, upper_SibSp = q1_SibSp - 1.5 * iqr_SibSp, q3_SibSp + 1.5 * iqr_SibSp - df['SibSp'] = df['SibSp'].clip(lower=lower_SibSp, upper=upper_SibSp) + df["SibSp"] = df["SibSp"].clip(lower=lower_SibSp, upper=upper_SibSp) # Drop highly correlated column 'Survived,Sex' - df = df.drop(columns=['Survived,Sex']) + df = df.drop(columns=["Survived,Sex"]) return df -if __name__ == '__main__': +if __name__ == "__main__": import sys if len(sys.argv) < 2: - print('Usage: python fixes.py [output.csv]') + print("Usage: python fixes.py [output.csv]") sys.exit(1) input_file = sys.argv[1] - output_file = sys.argv[2] if len(sys.argv) > 2 else 'cleaned_data.csv' + output_file = sys.argv[2] if len(sys.argv) > 2 else "cleaned_data.csv" df = pd.read_csv(input_file) df_clean = apply_fixes(df) df_clean.to_csv(output_file, index=False) - print(f'Cleaned data saved to {output_file}') \ No newline at end of file + print(f"Cleaned data saved to {output_file}") diff --git a/examples/reports/train_hashprep_report_pipeline.py b/examples/reports/train_hashprep_report_pipeline.py index 81c0478..3e19d09 100644 --- a/examples/reports/train_hashprep_report_pipeline.py +++ b/examples/reports/train_hashprep_report_pipeline.py @@ -5,27 +5,27 @@ from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -import numpy as np def build_preprocessing_pipeline(): """Build sklearn preprocessing pipeline.""" transformers = [ - ('drop_column_Cabin', 'drop', ['Cabin']), - ('drop_column_Survived,S', 'drop', ['Survived,Sex']), + ("drop_column_Cabin", "drop", ["Cabin"]), + ("drop_column_Survived,S", "drop", ["Survived,Sex"]), ] preprocessor = ColumnTransformer( transformers=transformers, - remainder='passthrough', + remainder="passthrough", verbose_feature_names_out=False, ) - pipeline = Pipeline([ - ('preprocessor', preprocessor), - ]) + pipeline = Pipeline( + [ + ("preprocessor", preprocessor), + ] + ) return pipeline @@ -37,20 +37,18 @@ def get_pre_pipeline_steps(): """ steps = [] # Outlier clipping for ['Fare'] - steps.append(('clip_outliers_Fare', None)) # Implement manually + steps.append(("clip_outliers_Fare", None)) # Implement manually # Outlier clipping for ['Parch'] - steps.append(('clip_outliers_Parch', None)) # Implement manually + steps.append(("clip_outliers_Parch", None)) # Implement manually # Outlier clipping for ['SibSp'] - steps.append(('clip_outliers_SibSp', None)) # Implement manually + steps.append(("clip_outliers_SibSp", None)) # Implement manually return steps -if __name__ == '__main__': - import joblib - +if __name__ == "__main__": pipeline = build_preprocessing_pipeline() if pipeline: - print('Pipeline created successfully') + print("Pipeline created successfully") print(pipeline) # Example: Save pipeline - # joblib.dump(pipeline, 'preprocessing_pipeline.joblib') \ No newline at end of file + # joblib.dump(pipeline, 'preprocessing_pipeline.joblib') diff --git a/hashprep/__init__.py b/hashprep/__init__.py index dd2743f..231a028 100644 --- a/hashprep/__init__.py +++ b/hashprep/__init__.py @@ -1,3 +1,3 @@ -from .core.analyzer import DatasetAnalyzer +from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer -__version__ = "0.1.0b1" \ No newline at end of file +__version__ = "0.1.0b1" diff --git a/hashprep/checks/__init__.py b/hashprep/checks/__init__.py index 094bc46..f44230f 100644 --- a/hashprep/checks/__init__.py +++ b/hashprep/checks/__init__.py @@ -1,29 +1,31 @@ -from typing import List, Optional - -from .core import Issue +from .columns import _check_duplicates, _check_high_cardinality, _check_mixed_data_types, _check_single_value_columns +from .core import Issue as Issue +from .correlations import calculate_correlations +from .distribution import _check_uniform_distribution, _check_unique_values from .drift import check_drift +from .imbalance import _check_class_imbalance from .leakage import _check_data_leakage, _check_target_leakage_patterns -from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, \ - _check_missing_patterns -from .columns import _check_single_value_columns, _check_high_cardinality, _check_duplicates, _check_mixed_data_types +from .missing_values import ( + _check_dataset_missingness, + _check_empty_columns, + _check_high_missing_values, + _check_missing_patterns, +) from .outliers import ( - _check_outliers, - _check_high_zero_counts, - _check_extreme_text_lengths, - _check_datetime_skew, - _check_skewness, - _check_infinite_values, _check_constant_length, + _check_datetime_skew, _check_empty_dataset, + _check_extreme_text_lengths, + _check_high_zero_counts, + _check_infinite_values, + _check_outliers, + _check_skewness, ) -from .correlations import calculate_correlations -from .imbalance import _check_class_imbalance -from .distribution import _check_uniform_distribution, _check_unique_values def _check_dataset_drift(analyzer): """Wrapper for drift detection that uses analyzer's comparison_df.""" - if hasattr(analyzer, 'comparison_df') and analyzer.comparison_df is not None: + if hasattr(analyzer, "comparison_df") and analyzer.comparison_df is not None: return check_drift(analyzer.df, analyzer.comparison_df) return [] @@ -56,7 +58,7 @@ def _check_dataset_drift(analyzer): CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"} -def run_checks(analyzer, checks_to_run: List[str]): +def run_checks(analyzer, checks_to_run: list[str]): issues = [] correlation_requested = False @@ -70,4 +72,4 @@ def run_checks(analyzer, checks_to_run: List[str]): if correlation_requested: issues.extend(calculate_correlations(analyzer)) - return issues \ No newline at end of file + return issues diff --git a/hashprep/checks/columns.py b/hashprep/checks/columns.py index b37a5eb..55881cc 100644 --- a/hashprep/checks/columns.py +++ b/hashprep/checks/columns.py @@ -1,8 +1,9 @@ -from .core import Issue from ..config import DEFAULT_CONFIG +from .core import Issue _COL_THRESHOLDS = DEFAULT_CONFIG.columns + def _check_single_value_columns(analyzer): issues = [] for col in analyzer.df.columns: @@ -26,7 +27,12 @@ def _check_single_value_columns(analyzer): ) return issues -def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_cardinality_count, critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical): + +def _check_high_cardinality( + analyzer, + threshold: int = _COL_THRESHOLDS.high_cardinality_count, + critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical, +): issues = [] categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist() for col in categorical_cols: @@ -52,6 +58,7 @@ def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_card ) return issues + def _check_duplicates(analyzer): issues = [] duplicate_rows = int(analyzer.df.duplicated().sum()) @@ -76,6 +83,7 @@ def _check_duplicates(analyzer): ) return issues + def _check_mixed_data_types(analyzer): issues = [] for col in analyzer.df.columns: @@ -91,4 +99,4 @@ def _check_mixed_data_types(analyzer): quick_fix="Options: \n- Cast to single type: Ensure consistency (Pros: Simplifies processing; Cons: May lose nuance).\n- Split column: Separate types into new features (Pros: Preserves info; Cons: Adds complexity).\n- Investigate source: Check data collection errors (Pros: Improves quality; Cons: Time-consuming).", ) ) - return issues \ No newline at end of file + return issues diff --git a/hashprep/checks/core.py b/hashprep/checks/core.py index 6f73640..cb03481 100644 --- a/hashprep/checks/core.py +++ b/hashprep/checks/core.py @@ -1,9 +1,8 @@ from dataclasses import dataclass -@dataclass +@dataclass class Issue: - category: str severity: str # critical or warning diff --git a/hashprep/checks/correlations.py b/hashprep/checks/correlations.py index 4854403..0955ba3 100644 --- a/hashprep/checks/correlations.py +++ b/hashprep/checks/correlations.py @@ -1,17 +1,20 @@ -from .core import Issue -import pandas as pd -import numpy as np -from scipy.stats import spearmanr, pearsonr, kendalltau, chi2_contingency from itertools import combinations -from .discretizer import Discretizer, DiscretizationType -from ..utils.type_inference import is_usable_for_corr + +import numpy as np +import pandas as pd +from scipy.stats import chi2_contingency, kendalltau, pearsonr, spearmanr + from ..config import DEFAULT_CONFIG +from ..utils.type_inference import is_usable_for_corr +from .core import Issue +from .discretizer import DiscretizationType, Discretizer _CORR = DEFAULT_CONFIG.correlations CORR_THRESHOLDS = _CORR.as_nested_dict() CAT_MAX_DISTINCT = _CORR.max_distinct_categories LOW_CARD_NUM_THRESHOLD = _CORR.low_cardinality_numeric + def _cramers_v_corrected(table: pd.DataFrame) -> float: if table.empty or (table.shape[0] == 1 or table.shape[1] == 1): return 0.0 @@ -19,11 +22,11 @@ def _cramers_v_corrected(table: pd.DataFrame) -> float: n = table.sum().sum() phi2 = chi2 / n r, k = table.shape - with np.errstate(divide='ignore', invalid='ignore'): - phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1)) - rcorr = r - ((r-1)**2)/(n-1) - kcorr = k - ((k-1)**2)/(n-1) - rkcorr = min((kcorr-1), (rcorr-1)) + with np.errstate(divide="ignore", invalid="ignore"): + phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) + rcorr = r - ((r - 1) ** 2) / (n - 1) + kcorr = k - ((k - 1) ** 2) / (n - 1) + rkcorr = min((kcorr - 1), (rcorr - 1)) if rkcorr == 0: return 1.0 return np.sqrt(phi2corr / rkcorr) @@ -40,14 +43,20 @@ def calculate_correlations(analyzer, thresholds=None): inferred_types = analyzer.column_types # Use analyzer.column_types for inferred types dict issues = [] - numeric_cols = [col for col, typ in inferred_types.items() if - typ == 'Numeric' and is_usable_for_corr(analyzer.df[col])] - cat_cols = [col for col, typ in inferred_types.items() if typ == 'Categorical' and - 1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT and is_usable_for_corr(analyzer.df[col])] - - issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric'])) - issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds['categorical'])) - issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds['mixed'])) + numeric_cols = [ + col for col, typ in inferred_types.items() if typ == "Numeric" and is_usable_for_corr(analyzer.df[col]) + ] + cat_cols = [ + col + for col, typ in inferred_types.items() + if typ == "Categorical" + and 1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT + and is_usable_for_corr(analyzer.df[col]) + ] + + issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds["numeric"])) + issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds["categorical"])) + issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds["mixed"])) return issues @@ -57,7 +66,7 @@ def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict): if len(numeric_cols) < 2: return issues - num_df = analyzer.df[numeric_cols].dropna(how='all') + num_df = analyzer.df[numeric_cols].dropna(how="all") for col1, col2 in combinations(numeric_cols, 2): series1, series2 = num_df[col1].dropna(), num_df[col2].dropna() @@ -76,35 +85,38 @@ def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict): # Kendall (only for low-cardinality numerics) kendall_corr, kendall_p = None, None - is_low_card = (series1.nunique() <= LOW_CARD_NUM_THRESHOLD or - series2.nunique() <= LOW_CARD_NUM_THRESHOLD) + is_low_card = series1.nunique() <= LOW_CARD_NUM_THRESHOLD or series2.nunique() <= LOW_CARD_NUM_THRESHOLD if is_low_card: kendall_corr, kendall_p = kendalltau(series1, series2) kendall_corr = abs(kendall_corr) # Flag if any metric exceeds threshold - metrics = [('Spearman', spearman_corr, spearman_p, thresholds['spearman']), - ('Pearson', pearson_corr, pearson_p, thresholds['pearson'])] + metrics = [ + ("Spearman", spearman_corr, spearman_p, thresholds["spearman"]), + ("Pearson", pearson_corr, pearson_p, thresholds["pearson"]), + ] if kendall_corr is not None: - metrics.append(('Kendall', kendall_corr, kendall_p, thresholds['kendall'])) + metrics.append(("Kendall", kendall_corr, kendall_p, thresholds["kendall"])) for method, corr, p_val, thresh in metrics: - if corr > thresh['warning']: - severity = 'critical' if corr > thresh['critical'] else 'warning' - impact = 'high' if severity == 'critical' else 'medium' + if corr > thresh["warning"]: + severity = "critical" if corr > thresh["critical"] else "warning" + impact = "high" if severity == "critical" else "medium" quick_fix = ( f"Options: \n- Drop one feature (e.g., {col2}): Reduces multicollinearity.\n- PCA/combine: Retains info.\n- Use tree-based models." - if severity == 'critical' else - f"Options: \n- Monitor in modeling.\n- Drop if redundant." + if severity == "critical" + else "Options: \n- Monitor in modeling.\n- Drop if redundant." + ) + issues.append( + Issue( + category="feature_correlation", + severity=severity, + column=f"{col1},{col2}", + description=f"Numeric columns '{col1}' and '{col2}' highly correlated ({method}: {corr:.3f}, p={p_val:.4f})", + impact_score=impact, + quick_fix=quick_fix, + ) ) - issues.append(Issue( - category="feature_correlation", - severity=severity, - column=f"{col1},{col2}", - description=f"Numeric columns '{col1}' and '{col2}' highly correlated ({method}: {corr:.3f}, p={p_val:.4f})", - impact_score=impact, - quick_fix=quick_fix, - )) return issues @@ -117,22 +129,24 @@ def _check_categorical_correlation(analyzer, cat_cols: list, thresholds: dict): for col1, col2 in combinations(cat_cols, 2): table = pd.crosstab(analyzer.df[col1], analyzer.df[col2]) cramers_v = _cramers_v_corrected(table) - if cramers_v > thresholds['warning']: - severity = 'critical' if cramers_v > thresholds['critical'] else 'warning' - impact = 'high' if severity == 'critical' else 'medium' + if cramers_v > thresholds["warning"]: + severity = "critical" if cramers_v > thresholds["critical"] else "warning" + impact = "high" if severity == "critical" else "medium" quick_fix = ( "Options: \n- Drop one (less predictive). \n- Group categories. \n- Use trees (robust to assoc.)." - if severity == 'critical' else - "Options: \n- Monitor redundancy. \n- Re-encode." + if severity == "critical" + else "Options: \n- Monitor redundancy. \n- Re-encode." + ) + issues.append( + Issue( + category="feature_correlation", + severity=severity, + column=f"{col1},{col2}", + description=f"Categorical columns '{col1}' and '{col2}' highly associated (Cramer's V: {cramers_v:.3f})", + impact_score=impact, + quick_fix=quick_fix, + ) ) - issues.append(Issue( - category="feature_correlation", - severity=severity, - column=f"{col1},{col2}", - description=f"Categorical columns '{col1}' and '{col2}' highly associated (Cramer's V: {cramers_v:.3f})", - impact_score=impact, - quick_fix=quick_fix, - )) return issues @@ -147,21 +161,23 @@ def _check_mixed_correlation(analyzer, numeric_cols: list, cat_cols: list, thres for num_col, cat_col in [(n, c) for n in numeric_cols for c in cat_cols]: table = pd.crosstab(df_disc[cat_col], df_disc[num_col]) cramers_v = _cramers_v_corrected(table) - if cramers_v > thresholds['warning']: - severity = 'critical' if cramers_v > thresholds['critical'] else 'warning' - impact = 'high' if severity == 'critical' else 'medium' + if cramers_v > thresholds["warning"]: + severity = "critical" if cramers_v > thresholds["critical"] else "warning" + impact = "high" if severity == "critical" else "medium" quick_fix = ( "Options: \n- Drop one. \n- Discretize/encode differently. \n- Use robust models." - if severity == 'critical' else - "Options: \n- Monitor in modeling." + if severity == "critical" + else "Options: \n- Monitor in modeling." ) - issues.append(Issue( - category="feature_correlation", - severity=severity, - column=f"{cat_col},{num_col}", - description=f"Mixed columns '{cat_col}' (cat) and '{num_col}' (num) associated (Discretized Cramer's V: {cramers_v:.3f})", - impact_score=impact, - quick_fix=quick_fix, - )) - - return issues \ No newline at end of file + issues.append( + Issue( + category="feature_correlation", + severity=severity, + column=f"{cat_col},{num_col}", + description=f"Mixed columns '{cat_col}' (cat) and '{num_col}' (num) associated (Discretized Cramer's V: {cramers_v:.3f})", + impact_score=impact, + quick_fix=quick_fix, + ) + ) + + return issues diff --git a/hashprep/checks/discretizer.py b/hashprep/checks/discretizer.py index ade7064..889a303 100644 --- a/hashprep/checks/discretizer.py +++ b/hashprep/checks/discretizer.py @@ -1,11 +1,14 @@ -import pandas as pd -import numpy as np from enum import Enum +import numpy as np +import pandas as pd + + class DiscretizationType(Enum): UNIFORM = "uniform" QUANTILE = "quantile" + class Discretizer: def __init__(self, method=DiscretizationType.UNIFORM, n_bins=10): self.method = method diff --git a/hashprep/checks/distribution.py b/hashprep/checks/distribution.py index 82fa866..7446d5e 100644 --- a/hashprep/checks/distribution.py +++ b/hashprep/checks/distribution.py @@ -1,13 +1,12 @@ -from typing import List - from scipy.stats import kstest -from .core import Issue from ..config import DEFAULT_CONFIG +from .core import Issue _DIST = DEFAULT_CONFIG.distribution -def _check_uniform_distribution(analyzer, p_threshold: float = _DIST.uniform_p_value) -> List[Issue]: + +def _check_uniform_distribution(analyzer, p_threshold: float = _DIST.uniform_p_value) -> list[Issue]: """ Detect uniformly distributed numeric columns using Kolmogorov-Smirnov test. Uniform distributions often indicate synthetic IDs or sequential data. @@ -48,7 +47,7 @@ def _check_uniform_distribution(analyzer, p_threshold: float = _DIST.uniform_p_v return issues -def _check_unique_values(analyzer, threshold: float = _DIST.unique_value_ratio) -> List[Issue]: +def _check_unique_values(analyzer, threshold: float = _DIST.unique_value_ratio) -> list[Issue]: """ Detect columns where nearly all values are unique. High uniqueness often indicates identifiers, names, or free-text fields. diff --git a/hashprep/checks/drift.py b/hashprep/checks/drift.py index 4e5798b..332a4bc 100644 --- a/hashprep/checks/drift.py +++ b/hashprep/checks/drift.py @@ -2,9 +2,9 @@ import pandas as pd from scipy.stats import chisquare, ks_2samp -from .core import Issue from ..config import DEFAULT_CONFIG from ..utils.logging import get_logger +from .core import Issue _log = get_logger("checks.drift") @@ -88,7 +88,7 @@ def _check_categorical_drift( new_categories = set(test_counts.index) - set(train_counts.index) if new_categories: - sample_new = list(new_categories)[:_DRIFT.max_new_category_samples] + sample_new = list(new_categories)[: _DRIFT.max_new_category_samples] issues.append( Issue( category="dataset_drift", diff --git a/hashprep/checks/imbalance.py b/hashprep/checks/imbalance.py index 081ee62..19a2cd3 100644 --- a/hashprep/checks/imbalance.py +++ b/hashprep/checks/imbalance.py @@ -1,5 +1,6 @@ -from .core import Issue from ..config import DEFAULT_CONFIG +from .core import Issue + def _check_class_imbalance(analyzer, threshold: float = DEFAULT_CONFIG.imbalance.majority_class_ratio): issues = [] @@ -16,4 +17,4 @@ def _check_class_imbalance(analyzer, threshold: float = DEFAULT_CONFIG.imbalance quick_fix="Options: \n- Resample data: Use oversampling (e.g., SMOTE) or undersampling (Pros: Balances classes; Cons: May introduce bias or lose data).\n- Use class weights: Adjust model weights for imbalance (Pros: Simple; Cons: Model-dependent).\n- Stratified sampling: Ensure balanced splits in training (Pros: Improves evaluation; Cons: Requires careful implementation).", ) ) - return issues \ No newline at end of file + return issues diff --git a/hashprep/checks/leakage.py b/hashprep/checks/leakage.py index ecbbc1b..cdfa429 100644 --- a/hashprep/checks/leakage.py +++ b/hashprep/checks/leakage.py @@ -1,13 +1,15 @@ -from .core import Issue +import numpy as np import pandas as pd from scipy.stats import chi2_contingency, f_oneway -import numpy as np + from ..config import DEFAULT_CONFIG from ..utils.logging import get_logger +from .core import Issue _LEAK = DEFAULT_CONFIG.leakage _log = get_logger("checks.leakage") + def _check_data_leakage(analyzer): issues = [] if analyzer.target_col and analyzer.target_col in analyzer.df.columns: @@ -28,6 +30,7 @@ def _check_data_leakage(analyzer): ) return issues + def _check_target_leakage_patterns(analyzer): issues = [] if analyzer.target_col and analyzer.target_col in analyzer.df.columns: @@ -41,7 +44,11 @@ def _check_target_leakage_patterns(analyzer): corrs = numeric_cols.corrwith(target).abs() for col, corr in corrs.items(): severity = ( - "critical" if corr > _LEAK.numeric_critical else "warning" if corr > _LEAK.numeric_warning else None + "critical" + if corr > _LEAK.numeric_critical + else "warning" + if corr > _LEAK.numeric_warning + else None ) if severity: impact = "high" if severity == "critical" else "medium" @@ -62,9 +69,7 @@ def _check_target_leakage_patterns(analyzer): ) # Categorical target else: - cat_cols = analyzer.df.select_dtypes(include="object").drop( - columns=[analyzer.target_col], errors="ignore" - ) + cat_cols = analyzer.df.select_dtypes(include="object").drop(columns=[analyzer.target_col], errors="ignore") for col in cat_cols.columns: try: table = pd.crosstab(target, analyzer.df[col]) @@ -74,7 +79,11 @@ def _check_target_leakage_patterns(analyzer): r, k = table.shape cramers_v = np.sqrt(phi2 / min(k - 1, r - 1)) severity = ( - "critical" if cramers_v > _LEAK.categorical_critical else "warning" if cramers_v > _LEAK.categorical_warning else None + "critical" + if cramers_v > _LEAK.categorical_critical + else "warning" + if cramers_v > _LEAK.categorical_warning + else None ) if severity: impact = "high" if severity == "critical" else "medium" @@ -110,8 +119,11 @@ def _check_target_leakage_patterns(analyzer): try: f_stat, p_val = f_oneway(*groups) severity = ( - "critical" if f_stat > _LEAK.f_stat_critical and p_val < _LEAK.f_stat_p_value - else "warning" if f_stat > _LEAK.f_stat_warning and p_val < _LEAK.f_stat_p_value else None + "critical" + if f_stat > _LEAK.f_stat_critical and p_val < _LEAK.f_stat_p_value + else "warning" + if f_stat > _LEAK.f_stat_warning and p_val < _LEAK.f_stat_p_value + else None ) if severity: impact = "high" if severity == "critical" else "medium" @@ -133,4 +145,4 @@ def _check_target_leakage_patterns(analyzer): except (ValueError, RuntimeWarning) as e: _log.debug("F-test leakage check failed for '%s': %s", col, e) continue - return issues \ No newline at end of file + return issues diff --git a/hashprep/checks/missing_values.py b/hashprep/checks/missing_values.py index f407360..58e758c 100644 --- a/hashprep/checks/missing_values.py +++ b/hashprep/checks/missing_values.py @@ -1,17 +1,22 @@ -from scipy.stats import chi2_contingency, mannwhitneyu -from .core import Issue -import pandas as pd from collections import defaultdict + import numpy as np +import pandas as pd from numpy.linalg import LinAlgError +from scipy.stats import chi2_contingency, mannwhitneyu + from ..config import DEFAULT_CONFIG from ..utils.logging import get_logger +from .core import Issue _log = get_logger("checks.missing_values") _THRESHOLDS = DEFAULT_CONFIG.missing_values -def _check_high_missing_values(analyzer, threshold: float = _THRESHOLDS.warning, critical_threshold: float = _THRESHOLDS.critical): + +def _check_high_missing_values( + analyzer, threshold: float = _THRESHOLDS.warning, critical_threshold: float = _THRESHOLDS.critical +): issues = [] for col in analyzer.df.columns: missing_pct = float(analyzer.df[col].isna().mean()) @@ -35,6 +40,7 @@ def _check_high_missing_values(analyzer, threshold: float = _THRESHOLDS.warning, ) return issues + def _check_empty_columns(analyzer): issues = [] for col in analyzer.df.columns: @@ -51,11 +57,14 @@ def _check_empty_columns(analyzer): ) return issues -def _check_dataset_missingness(analyzer, threshold: float = _THRESHOLDS.dataset_warning_pct, critical_threshold: float = _THRESHOLDS.dataset_critical_pct): + +def _check_dataset_missingness( + analyzer, + threshold: float = _THRESHOLDS.dataset_warning_pct, + critical_threshold: float = _THRESHOLDS.dataset_critical_pct, +): issues = [] - missing_pct = float( - (analyzer.df.isnull().sum().sum() / (analyzer.df.shape[0] * analyzer.df.shape[1])) * 100 - ) + missing_pct = float((analyzer.df.isnull().sum().sum() / (analyzer.df.shape[0] * analyzer.df.shape[1])) * 100) if missing_pct > threshold: severity = "critical" if missing_pct > critical_threshold else "warning" impact = "high" if severity == "critical" else "medium" @@ -77,11 +86,16 @@ def _check_dataset_missingness(analyzer, threshold: float = _THRESHOLDS.dataset_ return issues -def _check_missing_patterns(analyzer, threshold: float = _THRESHOLDS.pattern_p_value, - critical_p_threshold: float = _THRESHOLDS.pattern_critical_p_value): +def _check_missing_patterns( + analyzer, + threshold: float = _THRESHOLDS.pattern_p_value, + critical_p_threshold: float = _THRESHOLDS.pattern_critical_p_value, +): issues = [] missing_cols = [ - col for col in analyzer.df.columns if int(analyzer.df[col].isna().sum()) >= _THRESHOLDS.pattern_min_missing_count + col + for col in analyzer.df.columns + if int(analyzer.df[col].isna().sum()) >= _THRESHOLDS.pattern_min_missing_count ] # grouping logic @@ -89,9 +103,7 @@ def _check_missing_patterns(analyzer, threshold: float = _THRESHOLDS.pattern_p_v num_patterns = defaultdict(list) # (missing_col, correlated_col, p_val, cohens_d) for col in missing_cols: - for other_col in analyzer.df.select_dtypes( - include=["object", "category"] - ).columns: + for other_col in analyzer.df.select_dtypes(include=["object", "category"]).columns: if col == other_col: continue try: @@ -125,19 +137,20 @@ def cramers_v(table): _log.debug("Chi-square test failed for '%s' vs '%s': %s", col, other_col, e) continue - for other_col in analyzer.df.select_dtypes( - include=["int64", "float64"] - ).columns: + for other_col in analyzer.df.select_dtypes(include=["int64", "float64"]).columns: if col == other_col: continue try: missing = analyzer.df[analyzer.df[col].isna()][other_col].dropna() non_missing = analyzer.df[analyzer.df[col].notna()][other_col].dropna() - if len(missing) < _THRESHOLDS.pattern_min_group_size or len(non_missing) < _THRESHOLDS.pattern_min_group_size: + if ( + len(missing) < _THRESHOLDS.pattern_min_group_size + or len(non_missing) < _THRESHOLDS.pattern_min_group_size + ): continue # Replaced f_oneway with mannwhitneyu - u_stat, p_val = mannwhitneyu(missing, non_missing, alternative='two-sided') + u_stat, p_val = mannwhitneyu(missing, non_missing, alternative="two-sided") # Cohen's d proxy as effect size pooled_std = np.sqrt((np.std(missing) ** 2 + np.std(non_missing) ** 2) / 2) @@ -160,7 +173,7 @@ def cramers_v(table): if all_patterns: # Sort by effect size (descending) and take top 3 all_patterns.sort(key=lambda x: x[2], reverse=True) # x[2] is effect size - top_corrs = [pat[0] for pat in all_patterns[:_THRESHOLDS.pattern_top_correlations]] + top_corrs = [pat[0] for pat in all_patterns[: _THRESHOLDS.pattern_top_correlations]] total_count = len(all_patterns) desc = f"Missingness in '{col}' correlates with {total_count} columns ({', '.join(top_corrs)})" @@ -170,7 +183,9 @@ def cramers_v(table): is_target_correlated = any(pat[0] == analyzer.target_col for pat in all_patterns) severity = ( "critical" - if p_val < critical_p_threshold and is_target_correlated and max_effect > _THRESHOLDS.pattern_effect_critical + if p_val < critical_p_threshold + and is_target_correlated + and max_effect > _THRESHOLDS.pattern_effect_critical else "warning" ) impact = "high" if severity == "critical" else "medium" @@ -191,4 +206,4 @@ def cramers_v(table): ) ) - return issues \ No newline at end of file + return issues diff --git a/hashprep/checks/outliers.py b/hashprep/checks/outliers.py index 9f79638..1cd3217 100644 --- a/hashprep/checks/outliers.py +++ b/hashprep/checks/outliers.py @@ -1,10 +1,12 @@ -from .core import Issue -import pandas as pd import numpy as np +import pandas as pd + from ..config import DEFAULT_CONFIG +from .core import Issue _THRESHOLDS = DEFAULT_CONFIG.outliers + def _check_outliers(analyzer, z_threshold: float = _THRESHOLDS.z_score): issues = [] for col in analyzer.df.select_dtypes(include="number").columns: @@ -34,7 +36,12 @@ def _check_outliers(analyzer, z_threshold: float = _THRESHOLDS.z_score): ) return issues -def _check_high_zero_counts(analyzer, threshold: float = _THRESHOLDS.zero_count_warning, critical_threshold: float = _THRESHOLDS.zero_count_critical): + +def _check_high_zero_counts( + analyzer, + threshold: float = _THRESHOLDS.zero_count_warning, + critical_threshold: float = _THRESHOLDS.zero_count_critical, +): issues = [] for col in analyzer.df.select_dtypes(include="number").columns: series = analyzer.df[col].dropna() @@ -61,7 +68,10 @@ def _check_high_zero_counts(analyzer, threshold: float = _THRESHOLDS.zero_count_ ) return issues -def _check_extreme_text_lengths(analyzer, max_threshold: int = _THRESHOLDS.text_length_max, min_threshold: int = _THRESHOLDS.text_length_min): + +def _check_extreme_text_lengths( + analyzer, max_threshold: int = _THRESHOLDS.text_length_max, min_threshold: int = _THRESHOLDS.text_length_min +): issues = [] for col in analyzer.df.select_dtypes(include="object").columns: series = analyzer.df[col].dropna().astype(str) @@ -69,9 +79,7 @@ def _check_extreme_text_lengths(analyzer, max_threshold: int = _THRESHOLDS.text_ continue lengths = series.str.len() if lengths.max() > max_threshold or lengths.min() < min_threshold: - extreme_ratio = float( - ((lengths > max_threshold) | (lengths < min_threshold)).mean() - ) + extreme_ratio = float(((lengths > max_threshold) | (lengths < min_threshold)).mean()) severity = "critical" if extreme_ratio > _THRESHOLDS.extreme_ratio_critical else "warning" impact = "high" if severity == "critical" else "medium" quick_fix = ( @@ -91,7 +99,12 @@ def _check_extreme_text_lengths(analyzer, max_threshold: int = _THRESHOLDS.text_ ) return issues -def _check_skewness(analyzer, skew_threshold: float = _THRESHOLDS.skewness_warning, critical_skew_threshold: float = _THRESHOLDS.skewness_critical): + +def _check_skewness( + analyzer, + skew_threshold: float = _THRESHOLDS.skewness_warning, + critical_skew_threshold: float = _THRESHOLDS.skewness_critical, +): issues = [] for col in analyzer.df.select_dtypes(include="number").columns: series = analyzer.df[col].dropna() @@ -99,7 +112,7 @@ def _check_skewness(analyzer, skew_threshold: float = _THRESHOLDS.skewness_warni continue skewness = float(series.skew()) abs_skew = abs(skewness) - + if abs_skew > skew_threshold: severity = "critical" if abs_skew > critical_skew_threshold else "warning" impact = "high" if severity == "critical" else "medium" @@ -120,6 +133,7 @@ def _check_skewness(analyzer, skew_threshold: float = _THRESHOLDS.skewness_warni ) return issues + def _check_datetime_skew(analyzer, threshold: float = _THRESHOLDS.datetime_skew): issues = [] for col in analyzer.df.select_dtypes(include="datetime64").columns: @@ -224,4 +238,4 @@ def _check_empty_dataset(analyzer): quick_fix="All data is missing. Check data extraction and verify the source.", ) ) - return issues \ No newline at end of file + return issues diff --git a/hashprep/config.py b/hashprep/config.py index 9c73c00..d62d3d9 100644 --- a/hashprep/config.py +++ b/hashprep/config.py @@ -7,12 +7,12 @@ """ from dataclasses import dataclass, field -from typing import Dict @dataclass(frozen=True) class MissingValueThresholds: """Thresholds for missing value detection.""" + warning: float = 0.4 critical: float = 0.7 dataset_warning_pct: float = 20.0 @@ -31,6 +31,7 @@ class MissingValueThresholds: @dataclass(frozen=True) class OutlierThresholds: """Thresholds for outlier detection.""" + z_score: float = 4.0 outlier_ratio_critical: float = 0.1 zero_count_warning: float = 0.5 @@ -49,6 +50,7 @@ class OutlierThresholds: @dataclass(frozen=True) class ColumnThresholds: """Thresholds for column-level checks.""" + high_cardinality_count: int = 100 high_cardinality_ratio_critical: float = 0.9 duplicate_ratio_critical: float = 0.1 @@ -57,6 +59,7 @@ class ColumnThresholds: @dataclass(frozen=True) class CorrelationThresholds: """Thresholds for correlation analysis.""" + spearman_warning: float = 0.7 spearman_critical: float = 0.95 pearson_warning: float = 0.7 @@ -73,19 +76,20 @@ class CorrelationThresholds: def as_nested_dict(self) -> dict: """Return thresholds in the nested dict format used by correlation checks.""" return { - 'numeric': { - 'spearman': {'warning': self.spearman_warning, 'critical': self.spearman_critical}, - 'pearson': {'warning': self.pearson_warning, 'critical': self.pearson_critical}, - 'kendall': {'warning': self.kendall_warning, 'critical': self.kendall_critical}, + "numeric": { + "spearman": {"warning": self.spearman_warning, "critical": self.spearman_critical}, + "pearson": {"warning": self.pearson_warning, "critical": self.pearson_critical}, + "kendall": {"warning": self.kendall_warning, "critical": self.kendall_critical}, }, - 'categorical': {'warning': self.categorical_warning, 'critical': self.categorical_critical}, - 'mixed': {'warning': self.mixed_warning, 'critical': self.mixed_critical}, + "categorical": {"warning": self.categorical_warning, "critical": self.categorical_critical}, + "mixed": {"warning": self.mixed_warning, "critical": self.mixed_critical}, } @dataclass(frozen=True) class LeakageThresholds: """Thresholds for data leakage detection.""" + numeric_critical: float = 0.98 numeric_warning: float = 0.95 categorical_critical: float = 0.95 @@ -98,6 +102,7 @@ class LeakageThresholds: @dataclass(frozen=True) class DriftThresholds: """Thresholds for dataset drift detection.""" + p_value: float = 0.05 critical_p_value: float = 0.001 max_categories_for_chi2: int = 50 @@ -107,6 +112,7 @@ class DriftThresholds: @dataclass(frozen=True) class DistributionThresholds: """Thresholds for distribution checks.""" + uniform_p_value: float = 0.1 uniform_min_samples: int = 20 unique_value_ratio: float = 0.95 @@ -116,25 +122,33 @@ class DistributionThresholds: @dataclass(frozen=True) class ImbalanceThresholds: """Thresholds for class imbalance detection.""" + majority_class_ratio: float = 0.9 @dataclass(frozen=True) class TypeInferenceConfig: """Configuration for type inference.""" + cat_cardinality_threshold: int = 50 cat_percentage_threshold: float = 0.05 num_low_cat_threshold: int = 10 - bool_mappings: Dict[str, bool] = field(default_factory=lambda: { - 'true': True, 'false': False, - 'yes': True, 'no': False, - 't': True, 'f': False, - }) + bool_mappings: dict[str, bool] = field( + default_factory=lambda: { + "true": True, + "false": False, + "yes": True, + "no": False, + "t": True, + "f": False, + } + ) @dataclass(frozen=True) class SamplingDefaults: """Default values for dataset sampling.""" + max_rows: int = 100_000 memory_threshold_mb: float = 500.0 @@ -142,6 +156,7 @@ class SamplingDefaults: @dataclass(frozen=True) class SummaryDefaults: """Defaults for summary generation.""" + histogram_bins: int = 10 top_n_values: int = 10 extreme_values_count: int = 10 @@ -151,6 +166,7 @@ class SummaryDefaults: @dataclass(frozen=True) class HashPrepConfig: """Root configuration aggregating all threshold groups.""" + missing_values: MissingValueThresholds = field(default_factory=MissingValueThresholds) outliers: OutlierThresholds = field(default_factory=OutlierThresholds) columns: ColumnThresholds = field(default_factory=ColumnThresholds) diff --git a/hashprep/core/analyzer.py b/hashprep/core/analyzer.py index bdbbc59..31e4161 100644 --- a/hashprep/core/analyzer.py +++ b/hashprep/core/analyzer.py @@ -1,7 +1,6 @@ import time import warnings from datetime import datetime -from typing import Dict, List, Optional import pandas as pd from scipy.stats import ConstantInputWarning @@ -62,11 +61,11 @@ class DatasetAnalyzer: def __init__( self, df: pd.DataFrame, - target_col: Optional[str] = None, - selected_checks: Optional[List[str]] = None, + target_col: str | None = None, + selected_checks: list[str] | None = None, include_plots: bool = False, - comparison_df: Optional[pd.DataFrame] = None, - sampling_config: Optional[SamplingConfig] = None, + comparison_df: pd.DataFrame | None = None, + sampling_config: SamplingConfig | None = None, auto_sample: bool = True, ): if not isinstance(df, pd.DataFrame): @@ -82,10 +81,10 @@ def __init__( self.target_col = target_col self.selected_checks = selected_checks self.include_plots = include_plots - self.issues: List = [] - self.summaries: Dict = {} + self.issues: list = [] + self.summaries: dict = {} - self.sampler: Optional[DatasetSampler] = None + self.sampler: DatasetSampler | None = None if auto_sample: self.sampler = DatasetSampler(sampling_config) if self.sampler.should_sample(df): @@ -100,10 +99,10 @@ def __init__( self.column_types = infer_types(self.df) - def analyze(self) -> Dict: + def analyze(self) -> dict: """Run all summaries and checks, return summary.""" # Suppress scipy warnings about constant input arrays - warnings.filterwarnings('ignore', category=ConstantInputWarning) + warnings.filterwarnings("ignore", category=ConstantInputWarning) analysis_start = datetime.now() start_time = time.time() @@ -114,12 +113,8 @@ def analyze(self) -> Dict: duplicate_info = get_duplicate_info(self.df) self.summaries["dataset_info"].update(duplicate_info) - self.summaries["variable_types"] = summarize_variable_types( - self.df, column_types=self.column_types - ) - self.summaries["variable_type_counts"] = summarize_variable_type_counts( - self.df, column_types=self.column_types - ) + self.summaries["variable_types"] = summarize_variable_types(self.df, column_types=self.column_types) + self.summaries["variable_type_counts"] = summarize_variable_type_counts(self.df, column_types=self.column_types) self.summaries["reproduction_info"] = add_reproduction_info(self.df) self.summaries["variables"] = summarize_variables(self.df, column_types=self.column_types) self.summaries.update(summarize_interactions(self.df)) @@ -151,22 +146,16 @@ def _generate_plots(self): plots = {} if stats["category"] == "Numeric": if stats["histogram"]["counts"]: - plots["histogram"] = plot_histogram( - self.df[col].dropna(), f"Histogram of {col}" - ) + plots["histogram"] = plot_histogram(self.df[col].dropna(), f"Histogram of {col}") elif stats["category"] in ["Categorical", "Boolean"]: if stats["categories"].get("common_values"): series = self.df[col].dropna().astype(str).value_counts().head(10) - plots["common_values_bar"] = plot_bar( - series, f"Top Values of {col}", col, "Count" - ) + plots["common_values_bar"] = plot_bar(series, f"Top Values of {col}", col, "Count") elif stats["category"] == "Text": if stats["words"]: word_counts = {w: d["count"] for w, d in stats["words"].items()} series = pd.Series(word_counts).head(10) - plots["word_bar"] = plot_bar( - series, f"Top Words in {col}", "Words", "Count" - ) + plots["word_bar"] = plot_bar(series, f"Top Words in {col}", "Words", "Count") stats["plots"] = plots @@ -178,8 +167,8 @@ def _generate_plots(self): for method in ["pearson", "spearman", "kendall"]: corr = numeric_df.corr(method=method) - self.summaries["numeric_correlations"]["plots"][method] = ( - plot_heatmap(corr, f"{method.capitalize()} Correlation") + self.summaries["numeric_correlations"]["plots"][method] = plot_heatmap( + corr, f"{method.capitalize()} Correlation" ) pairs = self.summaries.get("scatter_pairs", []) diff --git a/hashprep/core/visualizations.py b/hashprep/core/visualizations.py index c38280f..eda1f6a 100644 --- a/hashprep/core/visualizations.py +++ b/hashprep/core/visualizations.py @@ -1,22 +1,24 @@ +import base64 +import io + import matplotlib.pyplot as plt -import seaborn as sns import pandas as pd -import io -import base64 -from typing import Dict, Optional, List, Any +import seaborn as sns # Set style -plt.style.use('ggplot') +plt.style.use("ggplot") sns.set_palette("husl") + def _fig_to_base64(fig) -> str: buf = io.BytesIO() - fig.savefig(buf, format='png', bbox_inches='tight') + fig.savefig(buf, format="png", bbox_inches="tight") buf.seek(0) - data = base64.b64encode(buf.read()).decode('utf-8') + data = base64.b64encode(buf.read()).decode("utf-8") plt.close(fig) return data + def plot_histogram(series: pd.Series, title: str) -> str: fig, ax = plt.subplots(figsize=(4, 3)) sns.histplot(series, bins=10, ax=ax) @@ -25,6 +27,7 @@ def plot_histogram(series: pd.Series, title: str) -> str: ax.set_ylabel("Count") return _fig_to_base64(fig) + def plot_bar(series: pd.Series, title: str, xlabel: str, ylabel: str) -> str: fig, ax = plt.subplots(figsize=(4, 3)) series.plot(kind="bar", ax=ax) @@ -34,12 +37,14 @@ def plot_bar(series: pd.Series, title: str, xlabel: str, ylabel: str) -> str: plt.xticks(rotation=45, ha="right") return _fig_to_base64(fig) + def plot_heatmap(corr_matrix: pd.DataFrame, title: str, vmin: float = -1, vmax: float = 1) -> str: fig, ax = plt.subplots(figsize=(5, 4)) sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=vmin, vmax=vmax, ax=ax) ax.set_title(title) return _fig_to_base64(fig) + def plot_scatter(df: pd.DataFrame, x: str, y: str) -> str: fig, ax = plt.subplots(figsize=(4, 3)) sns.scatterplot(data=df, x=x, y=y, ax=ax) @@ -48,6 +53,7 @@ def plot_scatter(df: pd.DataFrame, x: str, y: str) -> str: ax.set_ylabel(y) return _fig_to_base64(fig) + def plot_missing_bar(missing_data: pd.Series) -> str: if missing_data.sum() == 0: return "" @@ -59,6 +65,7 @@ def plot_missing_bar(missing_data: pd.Series) -> str: plt.xticks(rotation=45, ha="right") return _fig_to_base64(fig) + def plot_missing_heatmap(df: pd.DataFrame) -> str: fig, ax = plt.subplots(figsize=(5, 3)) sns.heatmap(df.isnull(), cbar=False, cmap="viridis", ax=ax) diff --git a/hashprep/interfaces/cli/main.py b/hashprep/interfaces/cli/main.py index 6083aae..d5e216f 100644 --- a/hashprep/interfaces/cli/main.py +++ b/hashprep/interfaces/cli/main.py @@ -30,13 +30,7 @@ def json_numpy_handler(obj): def suggest_check_names(invalid_check, valid_checks, cutoff=0.4): """Suggest similar check names for an invalid check using fuzzybunny.""" # Use fuzzybunny to find the top 3 most similar check names - results = fuzzybunny.rank( - invalid_check, - valid_checks, - scorer='levenshtein', - threshold=cutoff, - top_n=3 - ) + results = fuzzybunny.rank(invalid_check, valid_checks, scorer="levenshtein", threshold=cutoff, top_n=3) # Extract just the matched strings from the results suggestions = [match[0] for match in results] return suggestions @@ -76,9 +70,7 @@ def version(): help="Max rows for sampling (default: 100000)", ) @click.option("--no-sample", is_flag=True, help="Disable automatic sampling") -def scan( - file_path, critical_only, quiet, json_out, target, checks, comparison, sample_size, no_sample -): +def scan(file_path, critical_only, quiet, json_out, target, checks, comparison, sample_size, no_sample): df = pd.read_csv(file_path) comparison_df = pd.read_csv(comparison) if comparison else None @@ -135,9 +127,7 @@ def scan( if "sampling_info" in summary and summary["sampling_info"].get("was_sampled"): info = summary["sampling_info"] - click.echo( - f"Sampled: {info['sample_fraction']*100:.1f}% of {info['original_rows']} rows" - ) + click.echo(f"Sampled: {info['sample_fraction'] * 100:.1f}% of {info['original_rows']} rows") if critical_only: click.echo("Critical Issues:") @@ -214,7 +204,7 @@ def details(file_path, target, checks, comparison, sample_size, no_sample): if "sampling_info" in summary and summary["sampling_info"].get("was_sampled"): info = summary["sampling_info"] click.echo( - f"Note: Analysis performed on {info['sample_fraction']*100:.1f}% sample ({int(info['original_rows'] * info['sample_fraction'])} of {info['original_rows']} rows)" + f"Note: Analysis performed on {info['sample_fraction'] * 100:.1f}% sample ({int(info['original_rows'] * info['sample_fraction'])} of {info['original_rows']} rows)" ) click.echo("\nCritical Issues:") @@ -259,9 +249,7 @@ def details(file_path, target, checks, comparison, sample_size, no_sample): @cli.command() @click.argument("file_path", type=click.Path(exists=True)) @click.option("--with-code", is_flag=True, help="Generate fixes.py and pipeline.py scripts") -@click.option( - "--full/--no-full", default=True, help="Include full summaries in report (default: True)" -) +@click.option("--full/--no-full", default=True, help="Include full summaries in report (default: True)") @click.option("--format", default="md", help="Report format: md, json, html, pdf") @click.option("--theme", default="minimal", help="HTML report theme: minimal, neubrutalism") @click.option("--target", default=None, help="Target column for relevant checks") @@ -343,15 +331,11 @@ def report( theme=theme, ) click.echo(f"Report saved to: {report_file}") - click.echo( - f"Summary: {summary['critical_count']} critical, {summary['warning_count']} warnings" - ) + click.echo(f"Summary: {summary['critical_count']} critical, {summary['warning_count']} warnings") if "sampling_info" in summary and summary["sampling_info"].get("was_sampled"): info = summary["sampling_info"] - click.echo( - f"Note: Analysis performed on {info['sample_fraction']*100:.1f}% sample" - ) + click.echo(f"Note: Analysis performed on {info['sample_fraction'] * 100:.1f}% sample") if with_code: issues = [Issue(**i) for i in summary["issues"]] diff --git a/hashprep/preparers/codegen.py b/hashprep/preparers/codegen.py index 02d4d23..14cb819 100644 --- a/hashprep/preparers/codegen.py +++ b/hashprep/preparers/codegen.py @@ -1,5 +1,3 @@ -from typing import Dict, List, Set - from .models import FixSuggestion, FixType from .strategies import ( ColumnDropStrategy, @@ -16,7 +14,7 @@ class CodeGenerator: """Generates executable Python code from fix suggestions.""" - STRATEGY_MAP: Dict[FixType, FixStrategy] = { + STRATEGY_MAP: dict[FixType, FixStrategy] = { FixType.DROP_COLUMN: ColumnDropStrategy(), FixType.DROP_DUPLICATES: DuplicateRemovalStrategy(), FixType.IMPUTE: ImputationStrategy(), @@ -26,13 +24,13 @@ class CodeGenerator: FixType.CLIP_OUTLIERS: OutlierStrategy(), } - def __init__(self, suggestions: List[FixSuggestion]): + def __init__(self, suggestions: list[FixSuggestion]): self.suggestions = suggestions def generate_pandas_script(self) -> str: """Generate a complete, runnable pandas script.""" imports = self._collect_imports() - code_blocks: List[str] = [] + code_blocks: list[str] = [] code_blocks.append('"""') code_blocks.append("Auto-generated data cleaning script by HashPrep.") @@ -82,9 +80,9 @@ def _generate_code_for_suggestion(self, suggestion: FixSuggestion) -> str: return strategy.generate_pandas_code(suggestion) return "" - def _collect_imports(self) -> List[str]: + def _collect_imports(self) -> list[str]: """Collect all required imports.""" - imports: Set[str] = {"import pandas as pd", "import numpy as np"} + imports: set[str] = {"import pandas as pd", "import numpy as np"} for suggestion in self.suggestions: strategy = self.STRATEGY_MAP.get(suggestion.fix_type) diff --git a/hashprep/preparers/fix_registry.py b/hashprep/preparers/fix_registry.py index 0100091..5bbee4f 100644 --- a/hashprep/preparers/fix_registry.py +++ b/hashprep/preparers/fix_registry.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, List, Optional +from collections.abc import Callable from ..checks.core import Issue from .models import ( @@ -6,7 +6,6 @@ FixSuggestion, FixType, ImputeMethod, - ScaleMethod, TransformMethod, ) @@ -16,15 +15,15 @@ class FixRegistry: def __init__( self, - column_types: Dict[str, str], - target_col: Optional[str] = None, - column_stats: Optional[Dict[str, Dict]] = None, + column_types: dict[str, str], + target_col: str | None = None, + column_stats: dict[str, dict] | None = None, ): self.column_types = column_types self.target_col = target_col self.column_stats = column_stats or {} - self._handlers: Dict[str, Callable[[Issue], List[FixSuggestion]]] = { + self._handlers: dict[str, Callable[[Issue], list[FixSuggestion]]] = { "missing_values": self._suggest_missing_fix, "high_missing_values": self._suggest_missing_fix, "empty_column": self._suggest_drop, @@ -42,7 +41,7 @@ def __init__( "feature_correlation": self._suggest_drop_correlated, } - def get_suggestions(self, issue: Issue) -> List[FixSuggestion]: + def get_suggestions(self, issue: Issue) -> list[FixSuggestion]: """Get fix suggestions for an issue.""" handler = self._handlers.get(issue.category) if handler: @@ -64,7 +63,7 @@ def _get_missing_pct(self, issue: Issue) -> float: pass return 50.0 - def _suggest_missing_fix(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_missing_fix(self, issue: Issue) -> list[FixSuggestion]: col = issue.column col_type = self._get_column_type(col) missing_pct = self._get_missing_pct(issue) @@ -115,7 +114,7 @@ def _suggest_missing_fix(self, issue: Issue) -> List[FixSuggestion]: ) ] - def _suggest_drop(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_drop(self, issue: Issue) -> list[FixSuggestion]: return [ FixSuggestion( fix_type=FixType.DROP_COLUMN, @@ -126,7 +125,7 @@ def _suggest_drop(self, issue: Issue) -> List[FixSuggestion]: ) ] - def _suggest_drop_with_warning(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_drop_with_warning(self, issue: Issue) -> list[FixSuggestion]: return [ FixSuggestion( fix_type=FixType.DROP_COLUMN, @@ -137,7 +136,7 @@ def _suggest_drop_with_warning(self, issue: Issue) -> List[FixSuggestion]: ) ] - def _suggest_encoding(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_encoding(self, issue: Issue) -> list[FixSuggestion]: col = issue.column desc = issue.description.lower() @@ -185,7 +184,7 @@ def _suggest_encoding(self, issue: Issue) -> List[FixSuggestion]: ) ] - def _suggest_dedupe(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_dedupe(self, issue: Issue) -> list[FixSuggestion]: return [ FixSuggestion( fix_type=FixType.DROP_DUPLICATES, @@ -197,7 +196,7 @@ def _suggest_dedupe(self, issue: Issue) -> List[FixSuggestion]: ) ] - def _suggest_outlier_fix(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_outlier_fix(self, issue: Issue) -> list[FixSuggestion]: return [ FixSuggestion( fix_type=FixType.CLIP_OUTLIERS, @@ -209,7 +208,7 @@ def _suggest_outlier_fix(self, issue: Issue) -> List[FixSuggestion]: ) ] - def _suggest_transform(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_transform(self, issue: Issue) -> list[FixSuggestion]: col = issue.column desc = issue.description.lower() @@ -250,7 +249,7 @@ def _suggest_transform(self, issue: Issue) -> List[FixSuggestion]: ) ] - def _suggest_drop_correlated(self, issue: Issue) -> List[FixSuggestion]: + def _suggest_drop_correlated(self, issue: Issue) -> list[FixSuggestion]: col = issue.column return [ FixSuggestion( diff --git a/hashprep/preparers/models.py b/hashprep/preparers/models.py index 1def025..1d96e06 100644 --- a/hashprep/preparers/models.py +++ b/hashprep/preparers/models.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Any class FixType(Enum): @@ -59,9 +59,9 @@ class FixSuggestion: """Structured representation of a data fix action.""" fix_type: FixType - columns: List[str] - method: Optional[str] = None - parameters: Dict[str, Any] = field(default_factory=dict) + columns: list[str] + method: str | None = None + parameters: dict[str, Any] = field(default_factory=dict) priority: int = 0 reason: str = "" source_issue_category: str = "" diff --git a/hashprep/preparers/pipeline_builder.py b/hashprep/preparers/pipeline_builder.py index 0783403..faa58b0 100644 --- a/hashprep/preparers/pipeline_builder.py +++ b/hashprep/preparers/pipeline_builder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any from .models import FixSuggestion, FixType from .strategies import ( @@ -16,7 +16,7 @@ class PipelineBuilder: Generates both code and actual pipeline objects. """ - STRATEGY_MAP: Dict[FixType, Any] = { + STRATEGY_MAP: dict[FixType, Any] = { FixType.DROP_COLUMN: ColumnDropStrategy(), FixType.IMPUTE: ImputationStrategy(), FixType.ENCODE: EncodingStrategy(), @@ -24,7 +24,7 @@ class PipelineBuilder: FixType.TRANSFORM: TransformationStrategy(), } - def __init__(self, suggestions: List[FixSuggestion]): + def __init__(self, suggestions: list[FixSuggestion]): self.suggestions = suggestions self._validate_suggestions() @@ -38,7 +38,7 @@ def _validate_suggestions(self) -> None: def generate_pipeline_code(self) -> str: """Generate sklearn pipeline code as a string.""" - code: List[str] = [] + code: list[str] = [] code.append('"""') code.append("Auto-generated sklearn preprocessing pipeline by HashPrep.") @@ -111,9 +111,9 @@ def generate_pipeline_code(self) -> str: return "\n".join(code) - def _collect_all_imports(self) -> List[str]: + def _collect_all_imports(self) -> list[str]: """Collect all required imports for the pipeline.""" - imports: Set[str] = { + imports: set[str] = { "from sklearn.pipeline import Pipeline", "from sklearn.compose import ColumnTransformer", "import numpy as np", @@ -128,10 +128,10 @@ def _collect_all_imports(self) -> List[str]: return sorted(imports) - def _build_transformer_list(self) -> List[Tuple[str, str, List[str]]]: + def _build_transformer_list(self) -> list[tuple[str, str, list[str]]]: """Build list of (name, transformer_code, columns) tuples.""" - transformers: List[Tuple[str, str, List[str]]] = [] - seen_names: Set[str] = set() + transformers: list[tuple[str, str, list[str]]] = [] + seen_names: set[str] = set() for suggestion in self.suggestions: if suggestion.parameters.get("pre_pipeline"): @@ -160,7 +160,7 @@ def _build_transformer_list(self) -> List[Tuple[str, str, List[str]]]: return transformers - def build_pipeline_object(self) -> Optional[Any]: + def build_pipeline_object(self) -> Any | None: """ Return an actual sklearn Pipeline object. Can be serialized with joblib. @@ -172,7 +172,7 @@ def build_pipeline_object(self) -> Optional[Any]: return None transformers = [] - seen_names: Set[str] = set() + seen_names: set[str] = set() for suggestion in self.suggestions: if suggestion.parameters.get("pre_pipeline"): @@ -207,7 +207,7 @@ def build_pipeline_object(self) -> Optional[Any]: return Pipeline([("preprocessor", preprocessor)]) - def _get_transformer_instance(self, suggestion: FixSuggestion) -> Optional[Any]: + def _get_transformer_instance(self, suggestion: FixSuggestion) -> Any | None: """Return actual transformer instance for a suggestion.""" try: from sklearn.impute import KNNImputer, SimpleImputer @@ -261,9 +261,7 @@ def _get_transformer_instance(self, suggestion: FixSuggestion) -> Optional[Any]: if method == "onehot": return OneHotEncoder(handle_unknown="ignore", sparse_output=False) if method in ("ordinal", "label"): - return OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=-1 - ) + return OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) return None if fix_type == FixType.TRANSFORM: @@ -274,8 +272,6 @@ def _get_transformer_instance(self, suggestion: FixSuggestion) -> Optional[Any]: if method == "log1p": return FunctionTransformer(np.log1p, validate=True) if method == "sqrt": - return FunctionTransformer( - lambda x: np.sqrt(np.clip(x, 0, None)), validate=True - ) + return FunctionTransformer(lambda x: np.sqrt(np.clip(x, 0, None)), validate=True) return None diff --git a/hashprep/preparers/strategies/__init__.py b/hashprep/preparers/strategies/__init__.py index 5c9c0dc..0bc31be 100644 --- a/hashprep/preparers/strategies/__init__.py +++ b/hashprep/preparers/strategies/__init__.py @@ -1,10 +1,10 @@ from .base import FixStrategy -from .imputation import ImputationStrategy +from .column_ops import ColumnDropStrategy, DuplicateRemovalStrategy from .encoding import EncodingStrategy +from .imputation import ImputationStrategy +from .outlier import OutlierStrategy from .scaling import ScalingStrategy from .transformation import TransformationStrategy -from .outlier import OutlierStrategy -from .column_ops import ColumnDropStrategy, DuplicateRemovalStrategy __all__ = [ "FixStrategy", diff --git a/hashprep/preparers/strategies/base.py b/hashprep/preparers/strategies/base.py index 06c194d..9b6d6b0 100644 --- a/hashprep/preparers/strategies/base.py +++ b/hashprep/preparers/strategies/base.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from typing import List, Optional, Tuple from ..models import FixSuggestion @@ -7,7 +6,7 @@ class FixStrategy(ABC): """Base class for all fix strategies.""" - SKLEARN_IMPORTS: List[str] = [] + SKLEARN_IMPORTS: list[str] = [] @abstractmethod def generate_pandas_code(self, suggestion: FixSuggestion) -> str: @@ -15,20 +14,18 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: pass @abstractmethod - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: """ Return (transformer_instance_code, column_list) for sklearn pipeline. Returns (None, []) if not applicable to sklearn pipelines. """ pass - def get_sklearn_imports(self) -> List[str]: + def get_sklearn_imports(self) -> list[str]: """Return required sklearn import statements.""" return self.SKLEARN_IMPORTS - def _format_column_list(self, columns: List[str]) -> str: + def _format_column_list(self, columns: list[str]) -> str: """Format column list as Python literal.""" if len(columns) == 1: return f"['{columns[0]}']" diff --git a/hashprep/preparers/strategies/column_ops.py b/hashprep/preparers/strategies/column_ops.py index 87ec891..0681ed8 100644 --- a/hashprep/preparers/strategies/column_ops.py +++ b/hashprep/preparers/strategies/column_ops.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Tuple - from ..models import FixSuggestion from .base import FixStrategy @@ -7,28 +5,24 @@ class ColumnDropStrategy(FixStrategy): """Strategy for dropping columns.""" - SKLEARN_IMPORTS: List[str] = [] + SKLEARN_IMPORTS: list[str] = [] def generate_pandas_code(self, suggestion: FixSuggestion) -> str: cols = self._format_column_list(suggestion.columns) return f"df = df.drop(columns={cols})" - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: return "'drop'", suggestion.columns class DuplicateRemovalStrategy(FixStrategy): """Strategy for removing duplicate rows.""" - SKLEARN_IMPORTS: List[str] = [] + SKLEARN_IMPORTS: list[str] = [] def generate_pandas_code(self, suggestion: FixSuggestion) -> str: keep = suggestion.parameters.get("keep", "first") return f"df = df.drop_duplicates(keep='{keep}')" - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: return None, [] diff --git a/hashprep/preparers/strategies/encoding.py b/hashprep/preparers/strategies/encoding.py index a33703d..234faaf 100644 --- a/hashprep/preparers/strategies/encoding.py +++ b/hashprep/preparers/strategies/encoding.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Tuple - from ..models import EncodeMethod, FixSuggestion from .base import FixStrategy @@ -40,9 +38,7 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: return f"df = pd.get_dummies(df, columns={self._format_column_list(cols)})" - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: method = suggestion.method cols = suggestion.columns diff --git a/hashprep/preparers/strategies/imputation.py b/hashprep/preparers/strategies/imputation.py index a4171ae..56342f8 100644 --- a/hashprep/preparers/strategies/imputation.py +++ b/hashprep/preparers/strategies/imputation.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Tuple - from ..models import FixSuggestion, ImputeMethod from .base import FixStrategy @@ -24,9 +22,7 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: if method == ImputeMethod.MODE.value: lines = [] for col in suggestion.columns: - lines.append( - f"df['{col}'] = df['{col}'].fillna(df['{col}'].mode().iloc[0])" - ) + lines.append(f"df['{col}'] = df['{col}'].fillna(df['{col}'].mode().iloc[0])") return "\n".join(lines) if method == ImputeMethod.CONSTANT.value: @@ -44,9 +40,7 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: return f"df[{cols}] = df[{cols}].fillna(df[{cols}].median())" - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: method = suggestion.method cols = suggestion.columns diff --git a/hashprep/preparers/strategies/outlier.py b/hashprep/preparers/strategies/outlier.py index 81e0367..7ac3a89 100644 --- a/hashprep/preparers/strategies/outlier.py +++ b/hashprep/preparers/strategies/outlier.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Tuple - from ..models import FixSuggestion from .base import FixStrategy @@ -20,12 +18,8 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: for col in cols: lines.append(f"q1_{col}, q3_{col} = df['{col}'].quantile([0.25, 0.75])") lines.append(f"iqr_{col} = q3_{col} - q1_{col}") - lines.append( - f"lower_{col}, upper_{col} = q1_{col} - 1.5 * iqr_{col}, q3_{col} + 1.5 * iqr_{col}" - ) - lines.append( - f"df['{col}'] = df['{col}'].clip(lower=lower_{col}, upper=upper_{col})" - ) + lines.append(f"lower_{col}, upper_{col} = q1_{col} - 1.5 * iqr_{col}, q3_{col} + 1.5 * iqr_{col}") + lines.append(f"df['{col}'] = df['{col}'].clip(lower=lower_{col}, upper=upper_{col})") return "\n".join(lines) if clip_method == "percentile": @@ -33,12 +27,8 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: upper_pct = suggestion.parameters.get("upper_pct", 0.99) lines = [] for col in cols: - lines.append( - f"low_{col}, high_{col} = df['{col}'].quantile([{lower_pct}, {upper_pct}])" - ) - lines.append( - f"df['{col}'] = df['{col}'].clip(lower=low_{col}, upper=high_{col})" - ) + lines.append(f"low_{col}, high_{col} = df['{col}'].quantile([{lower_pct}, {upper_pct}])") + lines.append(f"df['{col}'] = df['{col}'].clip(lower=low_{col}, upper=high_{col})") return "\n".join(lines) if clip_method == "zscore": @@ -47,20 +37,14 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: for col in cols: lines.append(f"mean_{col} = df['{col}'].mean()") lines.append(f"std_{col} = df['{col}'].std()") - lines.append( - f"lower_{col} = mean_{col} - {z_threshold} * std_{col}" - ) - lines.append( - f"upper_{col} = mean_{col} + {z_threshold} * std_{col}" - ) - lines.append( - f"df['{col}'] = df['{col}'].clip(lower=lower_{col}, upper=upper_{col})" - ) + lines.append(f"lower_{col} = mean_{col} - {z_threshold} * std_{col}") + lines.append(f"upper_{col} = mean_{col} + {z_threshold} * std_{col}") + lines.append(f"df['{col}'] = df['{col}'].clip(lower=lower_{col}, upper=upper_{col})") return "\n".join(lines) return self._generate_iqr_code(cols) - def _generate_iqr_code(self, cols: List[str]) -> str: + def _generate_iqr_code(self, cols: list[str]) -> str: lines = [] for col in cols: lines.append(f"q1_{col}, q3_{col} = df['{col}'].quantile([0.25, 0.75])") @@ -70,7 +54,5 @@ def _generate_iqr_code(self, cols: List[str]) -> str: ) return "\n".join(lines) - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: return None, suggestion.columns diff --git a/hashprep/preparers/strategies/scaling.py b/hashprep/preparers/strategies/scaling.py index 0140f88..6b05178 100644 --- a/hashprep/preparers/strategies/scaling.py +++ b/hashprep/preparers/strategies/scaling.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Tuple - from ..models import FixSuggestion, ScaleMethod from .base import FixStrategy @@ -27,9 +25,7 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: lines.append(f"q1_{col} = df['{col}'].quantile(0.25)") lines.append(f"q3_{col} = df['{col}'].quantile(0.75)") lines.append(f"iqr_{col} = q3_{col} - q1_{col}") - lines.append( - f"df['{col}'] = (df['{col}'] - df['{col}'].median()) / iqr_{col}" - ) + lines.append(f"df['{col}'] = (df['{col}'] - df['{col}'].median()) / iqr_{col}") return "\n".join(lines) if method == ScaleMethod.MAXABS.value: @@ -37,9 +33,7 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: return f"df[{cols}] = (df[{cols}] - df[{cols}].mean()) / df[{cols}].std()" - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: method = suggestion.method cols = suggestion.columns diff --git a/hashprep/preparers/strategies/transformation.py b/hashprep/preparers/strategies/transformation.py index 15b3997..b00a405 100644 --- a/hashprep/preparers/strategies/transformation.py +++ b/hashprep/preparers/strategies/transformation.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Tuple - from ..models import FixSuggestion, TransformMethod from .base import FixStrategy @@ -30,9 +28,7 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: "from scipy.stats import boxcox", ] for col in suggestion.columns: - lines.append( - f"df['{col}'], _ = boxcox(df['{col}'].clip(lower=1e-10).values)" - ) + lines.append(f"df['{col}'], _ = boxcox(df['{col}'].clip(lower=1e-10).values)") return "\n".join(lines) if method == TransformMethod.YEOJOHNSON.value: @@ -41,16 +37,12 @@ def generate_pandas_code(self, suggestion: FixSuggestion) -> str: "pt = PowerTransformer(method='yeo-johnson')", ] for col in suggestion.columns: - lines.append( - f"df[['{col}']] = pt.fit_transform(df[['{col}']])" - ) + lines.append(f"df[['{col}']] = pt.fit_transform(df[['{col}']])") return "\n".join(lines) return f"df[{cols}] = np.log1p(df[{cols}].clip(lower=0))" - def get_sklearn_transformer( - self, suggestion: FixSuggestion - ) -> Tuple[Optional[str], List[str]]: + def get_sklearn_transformer(self, suggestion: FixSuggestion) -> tuple[str | None, list[str]]: method = suggestion.method cols = suggestion.columns diff --git a/hashprep/preparers/suggestions.py b/hashprep/preparers/suggestions.py index 7a1fa8d..bb9dc3e 100644 --- a/hashprep/preparers/suggestions.py +++ b/hashprep/preparers/suggestions.py @@ -1,5 +1,3 @@ -from typing import Dict, List, Optional - from ..checks.core import Issue from .fix_registry import FixRegistry from .models import FixSuggestion @@ -13,10 +11,10 @@ class SuggestionProvider: def __init__( self, - issues: List[Issue], - column_types: Optional[Dict[str, str]] = None, - target_col: Optional[str] = None, - column_stats: Optional[Dict[str, Dict]] = None, + issues: list[Issue], + column_types: dict[str, str] | None = None, + target_col: str | None = None, + column_stats: dict[str, dict] | None = None, ): self.issues = issues self.column_types = column_types or {} @@ -24,9 +22,9 @@ def __init__( self.column_stats = column_stats or {} self.registry = FixRegistry(self.column_types, target_col, column_stats) - def get_suggestions(self) -> List[FixSuggestion]: + def get_suggestions(self) -> list[FixSuggestion]: """Generate all fix suggestions, deduplicated and prioritized.""" - suggestions: List[FixSuggestion] = [] + suggestions: list[FixSuggestion] = [] seen_columns: set = set() sorted_issues = sorted( @@ -44,9 +42,9 @@ def get_suggestions(self) -> List[FixSuggestion]: return sorted(suggestions, key=lambda s: s.priority) - def get_suggestions_by_type(self) -> Dict[str, List[FixSuggestion]]: + def get_suggestions_by_type(self) -> dict[str, list[FixSuggestion]]: """Group suggestions by fix type for organized output.""" - grouped: Dict[str, List[FixSuggestion]] = {} + grouped: dict[str, list[FixSuggestion]] = {} for suggestion in self.get_suggestions(): key = suggestion.fix_type.value if key not in grouped: @@ -54,7 +52,7 @@ def get_suggestions_by_type(self) -> Dict[str, List[FixSuggestion]]: grouped[key].append(suggestion) return grouped - def get_legacy_suggestions(self) -> List[Dict]: + def get_legacy_suggestions(self) -> list[dict]: """ Return suggestions in legacy format for backward compatibility. Maps to the old {issue, code} dict format. diff --git a/hashprep/reports/__init__.py b/hashprep/reports/__init__.py index 7743704..eb3709f 100644 --- a/hashprep/reports/__init__.py +++ b/hashprep/reports/__init__.py @@ -1 +1 @@ -from .generators import generate_report \ No newline at end of file +from .generators import generate_report as generate_report diff --git a/hashprep/reports/generators.py b/hashprep/reports/generators.py index a1d67da..a52f8af 100644 --- a/hashprep/reports/generators.py +++ b/hashprep/reports/generators.py @@ -9,9 +9,9 @@ def generate(self, summary, full=False, output_file=None): # Lazy loading report classes def _load_generators(): - from .markdown import MarkdownReport - from .json import JsonReport from .html import HtmlReport + from .json import JsonReport + from .markdown import MarkdownReport from .pdf import PdfReport return { @@ -21,17 +21,19 @@ def _load_generators(): "pdf": PdfReport(), } + # get generators dictionary def get_generators(): - if not hasattr(get_generators, 'cache'): + if not hasattr(get_generators, "cache"): get_generators.cache = _load_generators() return get_generators.cache + def generate_report(summary, format="md", full=False, output_file=None, theme="minimal"): generators = get_generators() if format not in generators: raise ValueError(f"Unsupported format: {format}") - + if format in ["html", "pdf"]: return generators[format].generate(summary, full, output_file, theme=theme) return generators[format].generate(summary, full, output_file) diff --git a/hashprep/reports/html.py b/hashprep/reports/html.py index f07853f..a86d27f 100644 --- a/hashprep/reports/html.py +++ b/hashprep/reports/html.py @@ -1,6 +1,5 @@ import json from datetime import datetime -from typing import Dict, List import pandas as pd import yaml @@ -102,9 +101,9 @@ def generate(self, summary, full=False, output_file=None, theme="minimal", pdf_m f.write(html_content) return html_content - def _group_alerts_by_type(self, issues: List[Dict]) -> Dict[str, List[Dict]]: + def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]: """Group issues into display categories for the alerts section.""" - groups: Dict[str, List[Dict]] = {} + groups: dict[str, list[dict]] = {} for issue in issues: alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other") @@ -114,7 +113,7 @@ def _group_alerts_by_type(self, issues: List[Dict]) -> Dict[str, List[Dict]]: return groups - def _generate_config(self, summary) -> Dict: + def _generate_config(self, summary) -> dict: """Generate configuration dict for download.""" reproduction_info = summary["summaries"].get("reproduction_info", {}) return { diff --git a/hashprep/reports/json.py b/hashprep/reports/json.py index 6594c1d..ec9b92a 100644 --- a/hashprep/reports/json.py +++ b/hashprep/reports/json.py @@ -1,6 +1,5 @@ import json from datetime import datetime -from typing import Dict import numpy as np @@ -22,7 +21,7 @@ def generate(self, summary, full=False, output_file=None): dataset_info = summary["summaries"]["dataset_info"] reproduction_info = summary["summaries"].get("reproduction_info", {}) - report: Dict = { + report: dict = { "metadata": { "generated": datetime.now().isoformat(), "version": hashprep.__version__, diff --git a/hashprep/reports/markdown.py b/hashprep/reports/markdown.py index 2b48dea..39282fc 100644 --- a/hashprep/reports/markdown.py +++ b/hashprep/reports/markdown.py @@ -1,14 +1,14 @@ import base64 import os -from typing import Dict, List import pandas as pd + +import hashprep + from ..utils.logging import get_logger _log = get_logger("reports.markdown") -import hashprep - class MarkdownReport: ALERT_TYPE_MAPPING = { @@ -114,8 +114,12 @@ def generate(self, summary, full=False, output_file=None): # Summary line content += "| Metric | Value |\n|--------|-------|\n" - content += f"| Distinct | {stats.get('distinct_count', 0)} ({stats.get('distinct_percentage', 0):.1f}%) |\n" - content += f"| Missing | {stats.get('missing_count', 0)} ({stats.get('missing_percentage', 0):.1f}%) |\n" + content += ( + f"| Distinct | {stats.get('distinct_count', 0)} ({stats.get('distinct_percentage', 0):.1f}%) |\n" + ) + content += ( + f"| Missing | {stats.get('missing_count', 0)} ({stats.get('missing_percentage', 0):.1f}%) |\n" + ) if cat == "Numeric": mean_val = stats.get("mean") content += f"| Mean | {f'{mean_val:.6g}' if mean_val is not None else 'N/A'} |\n" @@ -272,9 +276,9 @@ def generate(self, summary, full=False, output_file=None): f.write(content) return content - def _group_alerts_by_type(self, issues: List[Dict]) -> Dict[str, List[Dict]]: + def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]: """Group issues into display categories.""" - groups: Dict[str, List[Dict]] = {} + groups: dict[str, list[dict]] = {} for issue in issues: alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other") if alert_type not in groups: diff --git a/hashprep/reports/pdf.py b/hashprep/reports/pdf.py index b6330f8..db8dccf 100644 --- a/hashprep/reports/pdf.py +++ b/hashprep/reports/pdf.py @@ -1,5 +1,4 @@ from datetime import datetime -from typing import Dict, List import pandas as pd from jinja2 import Template @@ -88,8 +87,8 @@ def generate(self, summary, full=False, output_file=None, **kwargs): f.write(pdf_content) return pdf_content - def _group_alerts_by_type(self, issues: List[Dict]) -> Dict[str, List[Dict]]: - groups: Dict[str, List[Dict]] = {} + def _group_alerts_by_type(self, issues: list[dict]) -> dict[str, list[dict]]: + groups: dict[str, list[dict]] = {} for issue in issues: alert_type = self.ALERT_TYPE_MAPPING.get(issue["category"], "Other") if alert_type not in groups: diff --git a/hashprep/summaries/__init__.py b/hashprep/summaries/__init__.py index dba2c9d..fb3da47 100644 --- a/hashprep/summaries/__init__.py +++ b/hashprep/summaries/__init__.py @@ -1,11 +1,21 @@ from .dataset import ( - get_dataset_preview, - summarize_dataset_info, - summarize_variable_types, - add_reproduction_info, - get_duplicate_info, - summarize_variable_type_counts, -) -from .variables import summarize_variables -from .interactions import summarize_interactions -from .missing import summarize_missing_values \ No newline at end of file + add_reproduction_info as add_reproduction_info, +) +from .dataset import ( + get_dataset_preview as get_dataset_preview, +) +from .dataset import ( + get_duplicate_info as get_duplicate_info, +) +from .dataset import ( + summarize_dataset_info as summarize_dataset_info, +) +from .dataset import ( + summarize_variable_type_counts as summarize_variable_type_counts, +) +from .dataset import ( + summarize_variable_types as summarize_variable_types, +) +from .interactions import summarize_interactions as summarize_interactions +from .missing import summarize_missing_values as summarize_missing_values +from .variables import summarize_variables as summarize_variables diff --git a/hashprep/summaries/dataset.py b/hashprep/summaries/dataset.py index 9aba682..92ae295 100644 --- a/hashprep/summaries/dataset.py +++ b/hashprep/summaries/dataset.py @@ -1,8 +1,7 @@ -from typing import Optional, Dict +import hashlib -import pandas as pd import numpy as np -import hashlib +import pandas as pd import hashprep @@ -16,7 +15,7 @@ def get_dataset_preview(df): return {"head": head, "tail": tail, "sample": sample} -def summarize_dataset_info(df: pd.DataFrame) -> Dict: +def summarize_dataset_info(df: pd.DataFrame) -> dict: rows = df.shape[0] cols = df.shape[1] total_cells = rows * cols @@ -33,14 +32,12 @@ def summarize_dataset_info(df: pd.DataFrame) -> Dict: "average_record_size_bytes": float(round(total_memory_bytes / rows, 1)) if rows > 0 else 0.0, "missing_cells": missing_cells, "total_cells": int(total_cells), - "missing_percentage": float( - round(missing_cells / total_cells * 100, 1) - ) if total_cells > 0 else 0.0, + "missing_percentage": float(round(missing_cells / total_cells * 100, 1)) if total_cells > 0 else 0.0, } } -def get_duplicate_info(df: pd.DataFrame) -> Dict: +def get_duplicate_info(df: pd.DataFrame) -> dict: """Return duplicate row count and percentage.""" rows = len(df) duplicate_count = int(df.duplicated().sum()) @@ -51,7 +48,7 @@ def get_duplicate_info(df: pd.DataFrame) -> Dict: } -def summarize_variable_type_counts(df: pd.DataFrame, column_types: Dict[str, str]) -> Dict[str, int]: +def summarize_variable_type_counts(df: pd.DataFrame, column_types: dict[str, str]) -> dict[str, int]: """Count variables by inferred type.""" type_counts = { "Numeric": 0, @@ -61,7 +58,7 @@ def summarize_variable_type_counts(df: pd.DataFrame, column_types: Dict[str, str "Boolean": 0, "Unsupported": 0, } - for col, typ in column_types.items(): + for _col, typ in column_types.items(): if typ in type_counts: type_counts[typ] += 1 else: @@ -69,25 +66,21 @@ def summarize_variable_type_counts(df: pd.DataFrame, column_types: Dict[str, str return type_counts -def summarize_variable_types(df: pd.DataFrame, column_types: Optional[Dict[str, str]] = None) -> Dict[str, str]: +def summarize_variable_types(df: pd.DataFrame, column_types: dict[str, str] | None = None) -> dict[str, str]: """ Summarize column types using infer_types if column_types not provided. """ if column_types is None: from ..utils.type_inference import infer_types + column_types = infer_types(df) return column_types -def add_reproduction_info(df: pd.DataFrame) -> Dict: +def add_reproduction_info(df: pd.DataFrame) -> dict: """Generate reproduction metadata for the analysis.""" - dataset_hash = hashlib.md5( - pd.util.hash_pandas_object(df, index=True).values - ).hexdigest() + dataset_hash = hashlib.md5(pd.util.hash_pandas_object(df, index=True).values).hexdigest() return { "dataset_hash": dataset_hash, "software_version": hashprep.__version__, } - - - diff --git a/hashprep/summaries/interactions.py b/hashprep/summaries/interactions.py index 2ef9ae3..775c897 100644 --- a/hashprep/summaries/interactions.py +++ b/hashprep/summaries/interactions.py @@ -1,6 +1,7 @@ +import numpy as np import pandas as pd from scipy.stats import chi2_contingency, f_oneway -import numpy as np + from ..utils.logging import get_logger _log = get_logger("summaries.interactions") @@ -17,11 +18,7 @@ def summarize_interactions(df): def _scatter_plots_numeric(df): numeric_columns = df.select_dtypes(include="number").columns.tolist() - pairs = [ - (c1, c2) - for i, c1 in enumerate(numeric_columns) - for c2 in numeric_columns[i + 1 :] - ] + pairs = [(c1, c2) for i, c1 in enumerate(numeric_columns) for c2 in numeric_columns[i + 1 :]] return pairs diff --git a/hashprep/summaries/missing.py b/hashprep/summaries/missing.py index 263a82d..5908cec 100644 --- a/hashprep/summaries/missing.py +++ b/hashprep/summaries/missing.py @@ -1,17 +1,7 @@ -import pandas as pd - - def summarize_missing_values(df): missing_count = {col: int(val) for col, val in df.isnull().sum().to_dict().items()} - missing_percentage = { - col: float(val) - for col, val in (df.isnull().mean() * 100).round(2).to_dict().items() - } - missing_patterns = { - col: df[df[col].isna()].index.tolist() - for col in df.columns - if df[col].isna().any() - } + missing_percentage = {col: float(val) for col, val in (df.isnull().mean() * 100).round(2).to_dict().items()} + missing_patterns = {col: df[df[col].isna()].index.tolist() for col in df.columns if df[col].isna().any()} missing_data = {} missing_data["missing_values"] = {"count": missing_count, "percentage": missing_percentage} diff --git a/hashprep/summaries/variables.py b/hashprep/summaries/variables.py index 6b5243f..ba019e0 100644 --- a/hashprep/summaries/variables.py +++ b/hashprep/summaries/variables.py @@ -1,13 +1,16 @@ -import pandas as pd -import numpy as np -import unicodedata import re +import unicodedata from collections import defaultdict + +import numpy as np +import pandas as pd from scipy.stats import median_abs_deviation + from ..config import DEFAULT_CONFIG _SUMMARY = DEFAULT_CONFIG.summaries + def get_monotonicity(series: pd.Series) -> str: if series.is_monotonic_increasing: return "increasing" @@ -20,6 +23,7 @@ def get_monotonicity(series: pd.Series) -> str: def summarize_variables(df, column_types=None): if column_types is None: from ..utils.type_inference import infer_types + column_types = infer_types(df) inferred_types = column_types variables = {} @@ -27,9 +31,7 @@ def summarize_variables(df, column_types=None): typ = inferred_types.get(column, "Unsupported") non_missing_count = df[column].notna().sum() distinct_count = df[column].nunique() - distinct_percentage = ( - (distinct_count / non_missing_count * 100) if non_missing_count > 0 else 0 - ) + distinct_percentage = (distinct_count / non_missing_count * 100) if non_missing_count > 0 else 0 missing_count = int(df[column].isna().sum()) missing_percentage = (missing_count / len(df) * 100) if len(df) > 0 else 0 memory_size = df[column].memory_usage(deep=True) @@ -116,13 +118,10 @@ def _summarize_numeric(df, col): "counts": [int(x) for x in hist], } vc = series.value_counts().head(_SUMMARY.top_n_values) - common_values = { - str(v): {"count": int(c), "percentage": float(c / n * 100)} - for v, c in vc.items() - } + common_values = {str(v): {"count": int(c), "percentage": float(c / n * 100)} for v, c in vc.items()} extremes = { - "minimum_10": [float(x) for x in sorted(series)[:_SUMMARY.extreme_values_count]], - "maximum_10": [float(x) for x in sorted(series)[-_SUMMARY.extreme_values_count:]], + "minimum_10": [float(x) for x in sorted(series)[: _SUMMARY.extreme_values_count]], + "maximum_10": [float(x) for x in sorted(series)[-_SUMMARY.extreme_values_count :]], } stats = { "infinite_count": infinite_count, @@ -180,7 +179,6 @@ def _summarize_text(df, col): }, } lengths = series.str.len() - n = len(series) all_text = "".join(series) total_chars = len(all_text) distinct_chars = len(set(all_text)) @@ -309,9 +307,7 @@ def _summarize_categorical(df, col): text_summary = _summarize_text(df, col) n = len(series) vc = series.value_counts().head(10) - common_values = { - v: {"count": int(c), "percentage": float(c / n * 100)} for v, c in vc.items() - } + common_values = {v: {"count": int(c), "percentage": float(c / n * 100)} for v, c in vc.items()} stats = { "overview": text_summary["overview"], "categories": { @@ -372,9 +368,6 @@ def _summarize_boolean(df, col): bool_series = pd.to_numeric(series, errors="coerce").notna().astype(bool) vc = bool_series.value_counts() n = len(series) - common_values = { - str(k): {"count": int(v), "percentage": float(v / n * 100)} - for k, v in vc.items() - } + common_values = {str(k): {"count": int(v), "percentage": float(v / n * 100)} for k, v in vc.items()} stats = {"common_values": common_values} return stats diff --git a/hashprep/utils/sampling.py b/hashprep/utils/sampling.py index 8c7aa5f..7c2215b 100644 --- a/hashprep/utils/sampling.py +++ b/hashprep/utils/sampling.py @@ -1,5 +1,5 @@ -from dataclasses import dataclass, field -from typing import Dict, Literal, Optional, Tuple +from dataclasses import dataclass +from typing import Literal import pandas as pd @@ -16,8 +16,8 @@ class SamplingConfig: max_rows: int = DEFAULT_MAX_ROWS sample_method: Literal["random", "stratified", "systematic", "head"] = "random" - random_state: Optional[int] = 42 - stratify_column: Optional[str] = None + random_state: int | None = 42 + stratify_column: str | None = None memory_threshold_mb: float = DEFAULT_MEMORY_THRESHOLD_MB enabled: bool = True @@ -25,10 +25,10 @@ class SamplingConfig: class DatasetSampler: """Handles sampling of large datasets for efficient analysis.""" - def __init__(self, config: Optional[SamplingConfig] = None): + def __init__(self, config: SamplingConfig | None = None): self.config = config or SamplingConfig() - self.original_shape: Optional[Tuple[int, int]] = None - self.sample_fraction: Optional[float] = None + self.original_shape: tuple[int, int] | None = None + self.sample_fraction: float | None = None self.was_sampled: bool = False def should_sample(self, df: pd.DataFrame) -> bool: @@ -91,9 +91,7 @@ def _stratified_sample(self, df: pd.DataFrame, target_rows: int) -> pd.DataFrame n_samples = max(1, int(proportions[name] * target_rows)) n_samples = min(n_samples, len(group), remaining) if n_samples > 0: - samples.append( - group.sample(n=n_samples, random_state=self.config.random_state) - ) + samples.append(group.sample(n=n_samples, random_state=self.config.random_state)) remaining -= n_samples if remaining <= 0: break @@ -106,14 +104,12 @@ def _stratified_sample(self, df: pd.DataFrame, target_rows: int) -> pd.DataFrame if len(result) < target_rows and len(result) < len(df): additional_needed = min(target_rows - len(result), len(df) - len(result)) remaining_indices = df.index.difference(result.index) - additional = df.loc[remaining_indices].sample( - n=additional_needed, random_state=self.config.random_state - ) + additional = df.loc[remaining_indices].sample(n=additional_needed, random_state=self.config.random_state) result = pd.concat([result, additional]) return result.sample(frac=1, random_state=self.config.random_state) - def get_sampling_info(self) -> Dict: + def get_sampling_info(self) -> dict: """Return metadata about sampling performed.""" return { "was_sampled": self.was_sampled, diff --git a/hashprep/utils/type_inference.py b/hashprep/utils/type_inference.py index ded5d3f..49676ed 100644 --- a/hashprep/utils/type_inference.py +++ b/hashprep/utils/type_inference.py @@ -1,17 +1,17 @@ import pandas as pd -from typing import Dict from ..config import DEFAULT_CONFIG _TYPE_CFG = DEFAULT_CONFIG.type_inference CONFIG = { - 'cat_cardinality_threshold': _TYPE_CFG.cat_cardinality_threshold, - 'cat_percentage_threshold': _TYPE_CFG.cat_percentage_threshold, - 'num_low_cat_threshold': _TYPE_CFG.num_low_cat_threshold, - 'bool_mappings': _TYPE_CFG.bool_mappings, + "cat_cardinality_threshold": _TYPE_CFG.cat_cardinality_threshold, + "cat_percentage_threshold": _TYPE_CFG.cat_percentage_threshold, + "num_low_cat_threshold": _TYPE_CFG.num_low_cat_threshold, + "bool_mappings": _TYPE_CFG.bool_mappings, } -def infer_types(df: pd.DataFrame) -> Dict[str, str]: + +def infer_types(df: pd.DataFrame) -> dict[str, str]: """ Infer semantic types per ydata logic. Returns: {col: 'Numeric' | 'Categorical' | 'Text' | 'Unsupported'} @@ -20,38 +20,41 @@ def infer_types(df: pd.DataFrame) -> Dict[str, str]: for col in df.columns: series = df[col].dropna() if series.empty: - types[col] = 'Unsupported' + types[col] = "Unsupported" continue # Numeric inference (ydata's Numeric.contains_op + numeric_is_category) if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series): n_unique = series.nunique() - if 1 <= n_unique <= CONFIG['num_low_cat_threshold']: - types[col] = 'Categorical' # Low-card numeric → Categorical (e.g., SibSp, Parch) + if 1 <= n_unique <= CONFIG["num_low_cat_threshold"]: + types[col] = "Categorical" # Low-card numeric → Categorical (e.g., SibSp, Parch) else: - types[col] = 'Numeric' # High-card numeric (e.g., Age, Fare) + types[col] = "Numeric" # High-card numeric (e.g., Age, Fare) # String/Text inference (ydata's Text.contains_op + string_is_category) elif pd.api.types.is_string_dtype(series) or pd.api.types.is_object_dtype(series): n_unique = series.nunique() unique_pct = n_unique / len(series) - is_bool = all(s.lower() in CONFIG['bool_mappings'] for s in series[:5]) # Quick bool check + is_bool = all(s.lower() in CONFIG["bool_mappings"] for s in series[:5]) # Quick bool check if is_bool: - types[col] = 'Categorical' # Bool-like → Categorical - elif 1 <= n_unique <= CONFIG['cat_cardinality_threshold'] and unique_pct < CONFIG['cat_percentage_threshold']: - types[col] = 'Categorical' # Low-card string → Categorical (e.g., Sex, Embarked) + types[col] = "Categorical" # Bool-like → Categorical + elif ( + 1 <= n_unique <= CONFIG["cat_cardinality_threshold"] and unique_pct < CONFIG["cat_percentage_threshold"] + ): + types[col] = "Categorical" # Low-card string → Categorical (e.g., Sex, Embarked) else: - types[col] = 'Text' # High-card/unique → Text (e.g., Name, Cabin, Ticket) + types[col] = "Text" # High-card/unique → Text (e.g., Name, Cabin, Ticket) # Categorical dtype elif pd.api.types.is_categorical_dtype(series): - types[col] = 'Categorical' + types[col] = "Categorical" else: - types[col] = 'Unsupported' + types[col] = "Unsupported" return types + # Helper: Check if series is constant/empty (skip corr) def is_usable_for_corr(series: pd.Series) -> bool: - return series.nunique() > 1 and len(series.dropna()) > 1 \ No newline at end of file + return series.nunique() > 1 and len(series.dropna()) > 1 diff --git a/pyproject.toml b/pyproject.toml index 9438449..62f5d2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,33 @@ include = ["hashprep*"] [dependency-groups] dev = [ "pytest>=9.0.2", + "ruff>=0.8.0", ] [project.scripts] hashprep = "hashprep.interfaces.cli.main:cli" + +[tool.ruff] +target-version = "py310" +line-length = 120 + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "UP", # pyupgrade + "B", # flake8-bugbear + "SIM", # flake8-simplify +] +ignore = [ + "E501", # line too long (handled by formatter) + "B905", # zip without strict (requires 3.10+) + "SIM102", # collapsible-if (elif + nested if is often more readable) + "SIM108", # ternary operator (readability preference) + "UP007", # Optional[X] -> X | None (keep for 3.10 compat) +] + +[tool.ruff.lint.isort] +known-first-party = ["hashprep"] diff --git a/tests/test.py b/tests/test.py index 7936eca..81b023c 100755 --- a/tests/test.py +++ b/tests/test.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 -import hashprep import pandas as pd + +import hashprep from hashprep import DatasetAnalyzer TARGET_COLUMN = "Survived" diff --git a/tests/test_cli.py b/tests/test_cli.py index d19e7ac..b2b1e91 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -26,15 +26,10 @@ def temp_output_dir(): def run_cli(args, cwd=None): """Helper to run CLI commands.""" - cmd = ['uv', 'run', 'hashprep'] + args + cmd = ["uv", "run", "hashprep"] + args if cwd is None: cwd = Path(__file__).parent.parent - result = subprocess.run( - cmd, - capture_output=True, - text=True, - cwd=cwd - ) + result = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd) return result @@ -43,63 +38,60 @@ class TestCLIScan: def test_scan_basic(self, titanic_csv): """Test basic scan command.""" - result = run_cli(['scan', titanic_csv]) + result = run_cli(["scan", titanic_csv]) assert result.returncode == 0 - assert 'Dataset Health Check' in result.stdout - assert 'Critical Issues:' in result.stdout - assert 'Warnings:' in result.stdout + assert "Dataset Health Check" in result.stdout + assert "Critical Issues:" in result.stdout + assert "Warnings:" in result.stdout def test_scan_critical_only(self, titanic_csv): """Test scan with --critical-only flag.""" - result = run_cli(['scan', titanic_csv, '--critical-only']) + result = run_cli(["scan", titanic_csv, "--critical-only"]) assert result.returncode == 0 - assert 'Critical Issues:' in result.stdout + assert "Critical Issues:" in result.stdout def test_scan_quiet(self, titanic_csv): """Test scan with --quiet flag.""" - result = run_cli(['scan', titanic_csv, '--quiet']) + result = run_cli(["scan", titanic_csv, "--quiet"]) assert result.returncode == 0 # Should only show counts - assert 'critical' in result.stdout.lower() + assert "critical" in result.stdout.lower() def test_scan_json_output(self, titanic_csv): """Test scan with --json flag.""" - result = run_cli(['scan', titanic_csv, '--json']) + result = run_cli(["scan", titanic_csv, "--json"]) assert result.returncode == 0 # Should be valid JSON data = json.loads(result.stdout) - assert 'critical_issues' in data or 'critical_count' in data - assert 'warnings' in data or 'warning_count' in data - assert 'issues' in data + assert "critical_issues" in data or "critical_count" in data + assert "warnings" in data or "warning_count" in data + assert "issues" in data def test_scan_with_target(self, titanic_csv): """Test scan with target column.""" - result = run_cli(['scan', titanic_csv, '--target', 'Survived']) + result = run_cli(["scan", titanic_csv, "--target", "Survived"]) assert result.returncode == 0 - assert 'Dataset Health Check' in result.stdout + assert "Dataset Health Check" in result.stdout def test_scan_specific_checks(self, titanic_csv): """Test scan with specific checks.""" - result = run_cli([ - 'scan', titanic_csv, - '--checks', 'outliers,duplicates,high_missing_values' - ]) + result = run_cli(["scan", titanic_csv, "--checks", "outliers,duplicates,high_missing_values"]) assert result.returncode == 0 - assert 'Dataset Health Check' in result.stdout + assert "Dataset Health Check" in result.stdout def test_scan_with_sampling(self, titanic_csv): """Test scan with custom sample size.""" - result = run_cli(['scan', titanic_csv, '--sample-size', '500']) + result = run_cli(["scan", titanic_csv, "--sample-size", "500"]) assert result.returncode == 0 - if 'sample' in result.stdout.lower(): - assert '56.1%' in result.stdout # 500/891 + if "sample" in result.stdout.lower(): + assert "56.1%" in result.stdout # 500/891 class TestCLIDetails: @@ -107,27 +99,24 @@ class TestCLIDetails: def test_details_basic(self, titanic_csv): """Test basic details command.""" - result = run_cli(['details', titanic_csv]) + result = run_cli(["details", titanic_csv]) assert result.returncode == 0 - assert 'Detailed Analysis' in result.stdout - assert 'Critical Issues:' in result.stdout - assert 'Warnings:' in result.stdout - assert 'Dataset Summary:' in result.stdout + assert "Detailed Analysis" in result.stdout + assert "Critical Issues:" in result.stdout + assert "Warnings:" in result.stdout + assert "Dataset Summary:" in result.stdout def test_details_with_target(self, titanic_csv): """Test details with target column.""" - result = run_cli(['details', titanic_csv, '--target', 'Survived']) + result = run_cli(["details", titanic_csv, "--target", "Survived"]) assert result.returncode == 0 - assert 'Detailed Analysis' in result.stdout + assert "Detailed Analysis" in result.stdout def test_details_specific_checks(self, titanic_csv): """Test details with specific checks.""" - result = run_cli([ - 'details', titanic_csv, - '--checks', 'high_missing_values,outliers' - ]) + result = run_cli(["details", titanic_csv, "--checks", "high_missing_values,outliers"]) assert result.returncode == 0 @@ -137,120 +126,101 @@ class TestCLIReport: def test_report_markdown(self, titanic_csv, temp_output_dir): """Test Markdown report generation.""" - result = run_cli(['report', titanic_csv, '--format', 'md'], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--format", "md"], cwd=temp_output_dir) assert result.returncode == 0 - assert 'Report saved to:' in result.stdout - assert 'train_hashprep_report.md' in result.stdout + assert "Report saved to:" in result.stdout + assert "train_hashprep_report.md" in result.stdout # Check file was created - report_file = os.path.join(temp_output_dir, 'train_hashprep_report.md') + report_file = os.path.join(temp_output_dir, "train_hashprep_report.md") assert os.path.exists(report_file) def test_report_json(self, titanic_csv, temp_output_dir): """Test JSON report generation.""" - result = run_cli(['report', titanic_csv, '--format', 'json'], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--format", "json"], cwd=temp_output_dir) assert result.returncode == 0 - assert 'train_hashprep_report.json' in result.stdout + assert "train_hashprep_report.json" in result.stdout # Verify JSON is valid - report_file = os.path.join(temp_output_dir, 'train_hashprep_report.json') + report_file = os.path.join(temp_output_dir, "train_hashprep_report.json") assert os.path.exists(report_file) with open(report_file) as f: data = json.load(f) - assert 'metadata' in data - assert 'dataset_overview' in data + assert "metadata" in data + assert "dataset_overview" in data def test_report_html_minimal(self, titanic_csv, temp_output_dir): """Test HTML report with minimal theme.""" - result = run_cli([ - 'report', titanic_csv, - '--format', 'html', - '--theme', 'minimal', - '--full' - ], cwd=temp_output_dir) + result = run_cli( + ["report", titanic_csv, "--format", "html", "--theme", "minimal", "--full"], cwd=temp_output_dir + ) assert result.returncode == 0 - assert 'train_hashprep_report.html' in result.stdout + assert "train_hashprep_report.html" in result.stdout - report_file = os.path.join(temp_output_dir, 'train_hashprep_report.html') + report_file = os.path.join(temp_output_dir, "train_hashprep_report.html") assert os.path.exists(report_file) def test_report_html_neubrutalism(self, titanic_csv, temp_output_dir): """Test HTML report with neubrutalism theme.""" - result = run_cli([ - 'report', titanic_csv, - '--format', 'html', - '--theme', 'neubrutalism', - '--full' - ], cwd=temp_output_dir) + result = run_cli( + ["report", titanic_csv, "--format", "html", "--theme", "neubrutalism", "--full"], cwd=temp_output_dir + ) assert result.returncode == 0 - assert 'train_hashprep_report.html' in result.stdout + assert "train_hashprep_report.html" in result.stdout def test_report_pdf(self, titanic_csv, temp_output_dir): """Test PDF report generation.""" - result = run_cli(['report', titanic_csv, '--format', 'pdf', '--full'], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--format", "pdf", "--full"], cwd=temp_output_dir) assert result.returncode == 0 - assert 'train_hashprep_report.pdf' in result.stdout + assert "train_hashprep_report.pdf" in result.stdout - report_file = os.path.join(temp_output_dir, 'train_hashprep_report.pdf') + report_file = os.path.join(temp_output_dir, "train_hashprep_report.pdf") assert os.path.exists(report_file) # Check PDF magic number - with open(report_file, 'rb') as f: - assert f.read(4) == b'%PDF' + with open(report_file, "rb") as f: + assert f.read(4) == b"%PDF" def test_report_with_code_generation(self, titanic_csv, temp_output_dir): """Test report with code generation.""" - result = run_cli(['report', titanic_csv, '--with-code'], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--with-code"], cwd=temp_output_dir) assert result.returncode == 0 - assert 'fixes script saved' in result.stdout - assert 'pipeline script saved' in result.stdout + assert "fixes script saved" in result.stdout + assert "pipeline script saved" in result.stdout # Check files were created - assert os.path.exists(os.path.join(temp_output_dir, 'train_hashprep_report_fixes.py')) - assert os.path.exists(os.path.join(temp_output_dir, 'train_hashprep_report_pipeline.py')) + assert os.path.exists(os.path.join(temp_output_dir, "train_hashprep_report_fixes.py")) + assert os.path.exists(os.path.join(temp_output_dir, "train_hashprep_report_pipeline.py")) def test_report_no_visualizations(self, titanic_csv, temp_output_dir): """Test report without visualizations.""" - result = run_cli([ - 'report', titanic_csv, - '--format', 'html', - '--no-visualizations' - ], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--format", "html", "--no-visualizations"], cwd=temp_output_dir) assert result.returncode == 0 def test_report_no_full(self, titanic_csv, temp_output_dir): """Test summary-only report.""" - result = run_cli([ - 'report', titanic_csv, - '--format', 'md', - '--no-full' - ], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--format", "md", "--no-full"], cwd=temp_output_dir) assert result.returncode == 0 def test_report_with_target(self, titanic_csv, temp_output_dir): """Test report with target column.""" - result = run_cli([ - 'report', titanic_csv, - '--target', 'Survived', - '--format', 'json' - ], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--target", "Survived", "--format", "json"], cwd=temp_output_dir) assert result.returncode == 0 def test_report_specific_checks(self, titanic_csv, temp_output_dir): """Test report with specific checks.""" - result = run_cli([ - 'report', titanic_csv, - '--checks', 'outliers,high_missing_values,duplicates', - '--format', 'md' - ], cwd=temp_output_dir) + result = run_cli( + ["report", titanic_csv, "--checks", "outliers,high_missing_values,duplicates", "--format", "md"], + cwd=temp_output_dir, + ) assert result.returncode == 0 @@ -260,10 +230,10 @@ class TestCLIVersion: def test_version(self): """Test version command.""" - result = run_cli(['version']) + result = run_cli(["version"]) assert result.returncode == 0 - assert 'hashprep' in result.stdout.lower() + assert "hashprep" in result.stdout.lower() # Should show version number assert any(char.isdigit() for char in result.stdout) @@ -273,26 +243,23 @@ class TestCLIErrorHandling: def test_invalid_file(self): """Test with non-existent file.""" - result = run_cli(['scan', 'nonexistent.csv']) + result = run_cli(["scan", "nonexistent.csv"]) assert result.returncode != 0 def test_invalid_format(self, titanic_csv): """Test with invalid report format.""" - result = run_cli(['report', titanic_csv, '--format', 'invalid']) + result = run_cli(["report", titanic_csv, "--format", "invalid"]) # Should handle gracefully or error - assert result.returncode != 0 or 'error' in result.stderr.lower() + assert result.returncode != 0 or "error" in result.stderr.lower() def test_invalid_check_name(self, titanic_csv): """Test with invalid check name.""" - result = run_cli([ - 'report', titanic_csv, - '--checks', 'invalid_check_name' - ]) + result = run_cli(["report", titanic_csv, "--checks", "invalid_check_name"]) assert result.returncode == 0 - assert 'Warning: Invalid checks ignored' in result.stdout + assert "Warning: Invalid checks ignored" in result.stdout # Fuzzy suggestion feature (if merged) # assert 'Did you mean' in result.stdout @@ -303,43 +270,46 @@ class TestCLIIntegration: def test_full_workflow(self, titanic_csv, temp_output_dir): """Test complete workflow: scan -> details -> report.""" # Step 1: Scan - result = run_cli(['scan', titanic_csv]) + result = run_cli(["scan", titanic_csv]) assert result.returncode == 0 # Step 2: Details - result = run_cli(['details', titanic_csv]) + result = run_cli(["details", titanic_csv]) assert result.returncode == 0 # Step 3: Generate all report formats - for fmt in ['md', 'json', 'html', 'pdf']: - result = run_cli([ - 'report', titanic_csv, - '--format', fmt, - '--full' - ], cwd=temp_output_dir) + for fmt in ["md", "json", "html", "pdf"]: + result = run_cli(["report", titanic_csv, "--format", fmt, "--full"], cwd=temp_output_dir) assert result.returncode == 0 # Step 4: Generate code - result = run_cli(['report', titanic_csv, '--with-code'], cwd=temp_output_dir) + result = run_cli(["report", titanic_csv, "--with-code"], cwd=temp_output_dir) assert result.returncode == 0 def test_ml_workflow_with_target(self, titanic_csv, temp_output_dir): """Test ML-focused workflow with target column.""" # Generate report with target and code - result = run_cli([ - 'report', titanic_csv, - '--target', 'Survived', - '--with-code', - '--format', 'html', - '--theme', 'minimal', - '--full' - ], cwd=temp_output_dir) + result = run_cli( + [ + "report", + titanic_csv, + "--target", + "Survived", + "--with-code", + "--format", + "html", + "--theme", + "minimal", + "--full", + ], + cwd=temp_output_dir, + ) assert result.returncode == 0 - assert os.path.exists(os.path.join(temp_output_dir, 'train_hashprep_report.html')) - assert os.path.exists(os.path.join(temp_output_dir, 'train_hashprep_report_fixes.py')) - assert os.path.exists(os.path.join(temp_output_dir, 'train_hashprep_report_pipeline.py')) + assert os.path.exists(os.path.join(temp_output_dir, "train_hashprep_report.html")) + assert os.path.exists(os.path.join(temp_output_dir, "train_hashprep_report_fixes.py")) + assert os.path.exists(os.path.join(temp_output_dir, "train_hashprep_report_pipeline.py")) -if __name__ == '__main__': - pytest.main([__file__, '-v']) +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_codegen.py b/tests/test_codegen.py index a20964e..2cc8776 100644 --- a/tests/test_codegen.py +++ b/tests/test_codegen.py @@ -1,7 +1,5 @@ """Tests for code generation module.""" -import pytest - from hashprep.preparers.codegen import CodeGenerator from hashprep.preparers.models import ( EncodeMethod, diff --git a/tests/test_drift.py b/tests/test_drift.py index 9cd8fd6..1401a1a 100644 --- a/tests/test_drift.py +++ b/tests/test_drift.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd -import pytest from hashprep.checks.drift import check_drift @@ -62,9 +61,7 @@ def test_no_drift_same_categories(self): issues = check_drift(train, test) - drift_issues = [ - i for i in issues if "Drift" in i.description and "cat" in i.column - ] + drift_issues = [i for i in issues if "Drift" in i.description and "cat" in i.column] assert len(drift_issues) == 0 def test_drift_different_category_distribution(self): @@ -73,9 +70,7 @@ def test_drift_different_category_distribution(self): issues = check_drift(train, test) - drift_issues = [ - i for i in issues if "Drift" in i.description and "cat" in i.column - ] + drift_issues = [i for i in issues if "Drift" in i.description and "cat" in i.column] assert len(drift_issues) >= 1 def test_new_categories_detected(self): @@ -94,9 +89,5 @@ def test_skips_high_cardinality(self): issues = check_drift(train, test) - chi2_issues = [ - i - for i in issues - if "Chi-square" in i.description and i.column == "cat" - ] + chi2_issues = [i for i in issues if "Chi-square" in i.description and i.column == "cat"] assert len(chi2_issues) == 0 diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py new file mode 100644 index 0000000..71cf1cb --- /dev/null +++ b/tests/test_edge_cases.py @@ -0,0 +1,294 @@ +"""Tests for edge cases, failure paths, and boundary conditions.""" + +import numpy as np +import pandas as pd + +from hashprep import DatasetAnalyzer +from hashprep.checks.drift import check_drift + + +class TestEmptyDataframes: + """Test behavior with empty or minimal DataFrames.""" + + def test_empty_dataframe_analysis(self): + df = pd.DataFrame() + analyzer = DatasetAnalyzer(df, selected_checks=["empty_dataset"]) + summary = analyzer.analyze() + assert summary is not None + + def test_single_row_dataframe(self): + df = pd.DataFrame({"a": [1], "b": ["x"]}) + analyzer = DatasetAnalyzer(df) + summary = analyzer.analyze() + assert summary is not None + assert "issues" in summary + + def test_single_column_dataframe(self): + df = pd.DataFrame({"only_col": range(100)}) + analyzer = DatasetAnalyzer(df) + summary = analyzer.analyze() + assert summary is not None + + def test_all_nan_dataframe(self): + df = pd.DataFrame({"a": [np.nan] * 10, "b": [np.nan] * 10}) + analyzer = DatasetAnalyzer(df, selected_checks=["empty_dataset", "high_missing_values"]) + summary = analyzer.analyze() + assert summary["total_issues"] > 0 + + def test_all_nan_numeric_column_outliers(self): + df = pd.DataFrame({"a": [np.nan] * 10, "b": range(10)}) + analyzer = DatasetAnalyzer(df, selected_checks=["outliers"]) + summary = analyzer.analyze() + assert summary is not None + + def test_empty_drift_check(self): + issues = check_drift(pd.DataFrame(), pd.DataFrame()) + assert issues == [] + + def test_drift_with_all_nan_columns(self): + train = pd.DataFrame({"col": [np.nan] * 10}) + test = pd.DataFrame({"col": [np.nan] * 10}) + issues = check_drift(train, test) + assert isinstance(issues, list) + + +class TestConstantAndDegenerateColumns: + """Test with constant, zero-variance, and degenerate data.""" + + def test_all_zeros_column(self): + df = pd.DataFrame({"zeros": [0] * 100, "normal": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["outliers", "high_zero_counts"]) + summary = analyzer.analyze() + assert summary is not None + + def test_constant_numeric_column_outliers(self): + df = pd.DataFrame({"const": [42] * 100, "var": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["outliers", "single_value_columns"]) + summary = analyzer.analyze() + single_val_issues = [i for i in summary["issues"] if i["category"] == "single_value"] + assert len(single_val_issues) >= 1 + + def test_constant_string_column(self): + df = pd.DataFrame({"const": ["same"] * 100}) + analyzer = DatasetAnalyzer(df, selected_checks=["single_value_columns", "high_cardinality"]) + summary = analyzer.analyze() + assert summary is not None + + def test_single_category(self): + df = pd.DataFrame({"cat": ["A"] * 100, "num": range(100)}) + analyzer = DatasetAnalyzer(df, target_col="cat", selected_checks=["class_imbalance"]) + summary = analyzer.analyze() + assert summary is not None + + def test_infinite_values(self): + df = pd.DataFrame({"a": [1, 2, np.inf, -np.inf, 5], "b": range(5)}) + analyzer = DatasetAnalyzer(df, selected_checks=["infinite_values"]) + summary = analyzer.analyze() + inf_issues = [i for i in summary["issues"] if i["category"] == "infinite_values"] + assert len(inf_issues) >= 1 + + +class TestMixedAndEdgeCaseTypes: + """Test with mixed types, unusual dtypes, and edge cases.""" + + def test_mixed_types_column(self): + # Use object-typed column with numeric strings mixed with text + df = pd.DataFrame({"mixed": ["1", "two", "3.0", "four", "5"] * 20}) + analyzer = DatasetAnalyzer(df, selected_checks=["mixed_data_types"]) + summary = analyzer.analyze() + assert summary is not None + + def test_boolean_column(self): + df = pd.DataFrame({"flag": [True, False] * 50, "num": range(100)}) + analyzer = DatasetAnalyzer(df) + summary = analyzer.analyze() + assert summary is not None + + def test_datetime_column(self): + dates = pd.date_range("2020-01-01", periods=100, freq="D") + df = pd.DataFrame({"date": dates, "val": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["datetime_skew"]) + summary = analyzer.analyze() + assert summary is not None + + def test_very_long_strings(self): + df = pd.DataFrame({"text": ["x" * 10000] * 10 + ["short"] * 90}) + analyzer = DatasetAnalyzer(df, selected_checks=["extreme_text_lengths"]) + summary = analyzer.analyze() + assert summary is not None + + def test_empty_strings(self): + df = pd.DataFrame({"text": [""] * 50 + ["hello"] * 50}) + analyzer = DatasetAnalyzer(df, selected_checks=["extreme_text_lengths"]) + summary = analyzer.analyze() + assert summary is not None + + +class TestCorrelationEdgeCases: + """Test correlation checks with edge case data.""" + + def test_single_numeric_column_correlation(self): + df = pd.DataFrame({"x": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["feature_correlation"]) + summary = analyzer.analyze() + assert summary is not None + + def test_all_constant_columns_correlation(self): + df = pd.DataFrame({"a": [1] * 100, "b": [2] * 100}) + analyzer = DatasetAnalyzer(df, selected_checks=["feature_correlation"]) + summary = analyzer.analyze() + assert summary is not None + + def test_categorical_correlation_single_category(self): + df = pd.DataFrame({"cat1": ["A"] * 100, "cat2": ["X"] * 100}) + analyzer = DatasetAnalyzer(df, selected_checks=["categorical_correlation"]) + summary = analyzer.analyze() + assert summary is not None + + def test_mixed_correlation_no_variance(self): + df = pd.DataFrame({"cat": ["A", "B"] * 50, "num": [42] * 100}) + analyzer = DatasetAnalyzer(df, selected_checks=["mixed_correlation"]) + summary = analyzer.analyze() + assert summary is not None + + +class TestLeakageEdgeCases: + """Test leakage checks with edge case data.""" + + def test_leakage_target_identical_column(self): + df = pd.DataFrame({"target": [0, 1] * 50, "clone": [0, 1] * 50}) + analyzer = DatasetAnalyzer(df, target_col="target", selected_checks=["data_leakage"]) + summary = analyzer.analyze() + leakage = [i for i in summary["issues"] if i["category"] == "data_leakage"] + assert len(leakage) >= 1 + + def test_leakage_no_target(self): + df = pd.DataFrame({"a": range(100), "b": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["data_leakage"]) + summary = analyzer.analyze() + assert summary is not None + + def test_target_leakage_constant_feature(self): + df = pd.DataFrame( + { + "target": [0, 1] * 50, + "const": [42] * 100, + } + ) + analyzer = DatasetAnalyzer( + df, + target_col="target", + selected_checks=["target_leakage_patterns"], + ) + summary = analyzer.analyze() + assert summary is not None + + def test_categorical_target_leakage(self): + df = pd.DataFrame( + { + "target": ["yes", "no"] * 50, + "predictor": ["y", "n"] * 50, + "num": range(100), + } + ) + analyzer = DatasetAnalyzer( + df, + target_col="target", + selected_checks=["target_leakage_patterns"], + ) + summary = analyzer.analyze() + assert summary is not None + + +class TestSelectedChecksFiltering: + """Test that selected_checks properly filters checks.""" + + def test_unknown_checks_ignored(self): + df = pd.DataFrame({"a": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["nonexistent_check", "also_fake"]) + summary = analyzer.analyze() + assert summary["total_issues"] == 0 + + def test_empty_selected_checks(self): + df = pd.DataFrame({"a": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=[]) + summary = analyzer.analyze() + assert summary["total_issues"] == 0 + + def test_single_check_selected(self): + df = pd.DataFrame({"a": [0] * 95 + [999] * 5}) + analyzer = DatasetAnalyzer(df, selected_checks=["outliers"]) + summary = analyzer.analyze() + for issue in summary["issues"]: + assert issue["category"] == "outliers" + + +class TestDistributionEdgeCases: + """Test distribution checks with edge case data.""" + + def test_uniform_with_few_samples(self): + df = pd.DataFrame({"x": [1, 2, 3]}) + analyzer = DatasetAnalyzer(df, selected_checks=["uniform_distribution"]) + summary = analyzer.analyze() + assert summary is not None + + def test_unique_values_few_rows(self): + df = pd.DataFrame({"x": [1, 2, 3]}) + analyzer = DatasetAnalyzer(df, selected_checks=["unique_values"]) + summary = analyzer.analyze() + assert summary is not None + + def test_skewness_constant_column(self): + df = pd.DataFrame({"a": [5] * 100}) + analyzer = DatasetAnalyzer(df, selected_checks=["skewness"]) + summary = analyzer.analyze() + assert summary is not None + + +class TestMissingPatternsEdgeCases: + """Test missing value checks with edge case data.""" + + def test_no_missing_values(self): + df = pd.DataFrame({"a": range(100), "b": range(100)}) + analyzer = DatasetAnalyzer( + df, selected_checks=["high_missing_values", "dataset_missingness", "missing_patterns"] + ) + summary = analyzer.analyze() + missing_issues = [i for i in summary["issues"] if "missing" in i["category"].lower()] + assert len(missing_issues) == 0 + + def test_completely_missing_column(self): + df = pd.DataFrame({"empty": [None] * 100, "full": range(100)}) + analyzer = DatasetAnalyzer(df, selected_checks=["high_missing_values", "empty_columns"]) + summary = analyzer.analyze() + assert summary["total_issues"] > 0 + + +class TestDriftEdgeCases: + """Test drift detection edge cases beyond basic validation.""" + + def test_drift_disjoint_columns(self): + train = pd.DataFrame({"a": [1, 2, 3]}) + test = pd.DataFrame({"b": [4, 5, 6]}) + issues = check_drift(train, test) + assert issues == [] + + def test_drift_single_value_columns(self): + train = pd.DataFrame({"x": [1.0] * 100}) + test = pd.DataFrame({"x": [1.0] * 100}) + issues = check_drift(train, test) + assert isinstance(issues, list) + + def test_drift_mixed_column_types(self): + train = pd.DataFrame({"num": [1, 2, 3] * 100, "cat": ["a", "b", "c"] * 100}) + test = pd.DataFrame({"num": [10, 20, 30] * 100, "cat": ["a", "a", "a"] * 100}) + issues = check_drift(train, test) + assert len(issues) > 0 + + def test_drift_many_categories_skipped(self): + """Chi-square should be skipped for high-cardinality categoricals.""" + train = pd.DataFrame({"cat": [f"v{i}" for i in range(200)]}) + test = pd.DataFrame({"cat": [f"v{i}" for i in range(200)]}) + issues = check_drift(train, test) + chi2 = [i for i in issues if "Chi-square" in i.description] + assert len(chi2) == 0 diff --git a/tests/test_library_api.py b/tests/test_library_api.py index 02db03e..2dfdfa8 100644 --- a/tests/test_library_api.py +++ b/tests/test_library_api.py @@ -18,14 +18,16 @@ @pytest.fixture def sample_dataframe(): """Create a sample DataFrame for testing.""" - return pd.DataFrame({ - 'id': range(1, 101), - 'category': ['A', 'B', 'C'] * 33 + ['A'], - 'value': [i * 1.5 for i in range(100)], - 'target': [0, 1] * 50, - 'missing_col': [None] * 50 + list(range(50)), - 'constant': [42] * 100, - }) + return pd.DataFrame( + { + "id": range(1, 101), + "category": ["A", "B", "C"] * 33 + ["A"], + "value": [i * 1.5 for i in range(100)], + "target": [0, 1] * 50, + "missing_col": [None] * 50 + list(range(50)), + "constant": [42] * 100, + } + ) @pytest.fixture @@ -43,25 +45,25 @@ def test_basic_analysis(self, sample_dataframe): summary = analyzer.analyze() # Check summary structure - assert 'summaries' in summary - assert 'issues' in summary - assert 'critical_count' in summary - assert 'warning_count' in summary - assert 'total_issues' in summary + assert "summaries" in summary + assert "issues" in summary + assert "critical_count" in summary + assert "warning_count" in summary + assert "total_issues" in summary # Check summaries - assert 'dataset_info' in summary['summaries'] - assert 'variables' in summary['summaries'] - assert 'missing_values' in summary['summaries'] + assert "dataset_info" in summary["summaries"] + assert "variables" in summary["summaries"] + assert "missing_values" in summary["summaries"] def test_analysis_with_target(self, sample_dataframe): """Test analysis with target column specified.""" - analyzer = DatasetAnalyzer(sample_dataframe, target_col='target') + analyzer = DatasetAnalyzer(sample_dataframe, target_col="target") summary = analyzer.analyze() # Should detect issues related to target assert summary is not None - assert 'issues' in summary + assert "issues" in summary def test_analysis_with_plots(self, sample_dataframe): """Test analysis with visualizations enabled.""" @@ -69,20 +71,17 @@ def test_analysis_with_plots(self, sample_dataframe): summary = analyzer.analyze() # Check for plots in summaries - assert 'plots' in summary['summaries'] + assert "plots" in summary["summaries"] def test_specific_checks(self, sample_dataframe): """Test running specific checks only.""" - selected_checks = ['outliers', 'duplicates', 'high_missing_values'] - analyzer = DatasetAnalyzer( - sample_dataframe, - selected_checks=selected_checks - ) + selected_checks = ["outliers", "duplicates", "high_missing_values"] + analyzer = DatasetAnalyzer(sample_dataframe, selected_checks=selected_checks) summary = analyzer.analyze() # Should only run specified checks assert summary is not None - assert 'issues' in summary + assert "issues" in summary def test_sampling(self, sample_dataframe): """Test automatic sampling for large datasets.""" @@ -90,42 +89,34 @@ def test_sampling(self, sample_dataframe): large_df = pd.concat([sample_dataframe] * 1000, ignore_index=True) sampling_config = SamplingConfig(max_rows=1000) - analyzer = DatasetAnalyzer( - large_df, - sampling_config=sampling_config, - auto_sample=True - ) + analyzer = DatasetAnalyzer(large_df, sampling_config=sampling_config, auto_sample=True) summary = analyzer.analyze() # Check if sampling occurred - if 'sampling_info' in summary: - assert summary['sampling_info']['was_sampled'] + if "sampling_info" in summary: + assert summary["sampling_info"]["was_sampled"] def test_drift_detection(self, sample_dataframe): """Test drift detection with comparison dataset.""" # Create a drifted comparison dataset comparison_df = sample_dataframe.copy() - comparison_df['value'] = comparison_df['value'] * 2 # Drift in value + comparison_df["value"] = comparison_df["value"] * 2 # Drift in value - analyzer = DatasetAnalyzer( - sample_dataframe, - comparison_df=comparison_df, - selected_checks=['dataset_drift'] - ) + analyzer = DatasetAnalyzer(sample_dataframe, comparison_df=comparison_df, selected_checks=["dataset_drift"]) summary = analyzer.analyze() # Should detect drift - drift_issues = [i for i in summary['issues'] if i['category'] == 'dataset_drift'] + drift_issues = [i for i in summary["issues"] if i["category"] == "dataset_drift"] assert len(drift_issues) > 0 def test_all_checks(self, sample_dataframe): """Test that all available checks can run.""" - analyzer = DatasetAnalyzer(sample_dataframe, target_col='target') + analyzer = DatasetAnalyzer(sample_dataframe, target_col="target") summary = analyzer.analyze() # All checks should complete without error - assert 'issues' in summary - assert isinstance(summary['issues'], list) + assert "issues" in summary + assert isinstance(summary["issues"], list) class TestReportGeneration: @@ -136,26 +127,21 @@ def test_markdown_report(self, sample_dataframe): analyzer = DatasetAnalyzer(sample_dataframe) summary = analyzer.analyze() - with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: output_file = f.name try: - report = generate_report( - summary, - format='md', - full=True, - output_file=output_file - ) + report = generate_report(summary, format="md", full=True, output_file=output_file) assert report is not None assert os.path.exists(output_file) assert os.path.getsize(output_file) > 0 # Check content - with open(output_file, 'r') as f: + with open(output_file) as f: content = f.read() - assert '# Dataset Quality Report' in content - assert '## Overview' in content + assert "# Dataset Quality Report" in content + assert "## Overview" in content finally: if os.path.exists(output_file): os.remove(output_file) @@ -165,27 +151,23 @@ def test_json_report(self, sample_dataframe): analyzer = DatasetAnalyzer(sample_dataframe) summary = analyzer.analyze() - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: output_file = f.name try: - report = generate_report( - summary, - format='json', - full=True, - output_file=output_file - ) + report = generate_report(summary, format="json", full=True, output_file=output_file) assert report is not None assert os.path.exists(output_file) # Verify it's valid JSON import json - with open(output_file, 'r') as f: + + with open(output_file) as f: data = json.load(f) - assert 'metadata' in data - assert 'dataset_overview' in data - assert 'alerts' in data + assert "metadata" in data + assert "dataset_overview" in data + assert "alerts" in data finally: if os.path.exists(output_file): os.remove(output_file) @@ -195,26 +177,20 @@ def test_html_report_minimal(self, sample_dataframe): analyzer = DatasetAnalyzer(sample_dataframe, include_plots=True) summary = analyzer.analyze() - with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f: output_file = f.name try: - report = generate_report( - summary, - format='html', - full=True, - output_file=output_file, - theme='minimal' - ) + report = generate_report(summary, format="html", full=True, output_file=output_file, theme="minimal") assert report is not None assert os.path.exists(output_file) # Check HTML content - with open(output_file, 'r') as f: + with open(output_file) as f: content = f.read() - assert ' 0 # Check PDF magic number - with open(output_file, 'rb') as f: + with open(output_file, "rb") as f: header = f.read(4) - assert header == b'%PDF' + assert header == b"%PDF" finally: if os.path.exists(output_file): os.remove(output_file) @@ -281,112 +246,114 @@ class TestChecks: def test_missing_value_checks(self): """Test missing value detection checks.""" - df = pd.DataFrame({ - 'mostly_missing': [None] * 90 + [1] * 10, - 'some_missing': [None] * 20 + [1] * 80, - 'no_missing': range(100), - }) - - analyzer = DatasetAnalyzer( - df, - selected_checks=['high_missing_values', 'dataset_missingness'] + df = pd.DataFrame( + { + "mostly_missing": [None] * 90 + [1] * 10, + "some_missing": [None] * 20 + [1] * 80, + "no_missing": range(100), + } ) + + analyzer = DatasetAnalyzer(df, selected_checks=["high_missing_values", "dataset_missingness"]) summary = analyzer.analyze() # Should detect high missing values - missing_issues = [i for i in summary['issues'] if 'missing' in i['category'].lower()] + missing_issues = [i for i in summary["issues"] if "missing" in i["category"].lower()] assert len(missing_issues) > 0 def test_correlation_checks(self): """Test correlation detection.""" - df = pd.DataFrame({ - 'x': range(100), - 'y': [i * 2 for i in range(100)], # Highly correlated - 'z': [i ** 2 for i in range(100)], - }) - - analyzer = DatasetAnalyzer( - df, - selected_checks=['feature_correlation'] + df = pd.DataFrame( + { + "x": range(100), + "y": [i * 2 for i in range(100)], # Highly correlated + "z": [i**2 for i in range(100)], + } ) + + analyzer = DatasetAnalyzer(df, selected_checks=["feature_correlation"]) summary = analyzer.analyze() # Check that correlations were computed - assert 'numeric_correlations' in summary['summaries'] + assert "numeric_correlations" in summary["summaries"] def test_outlier_detection(self): """Test outlier detection.""" - df = pd.DataFrame({ - 'normal': range(100), - 'with_outliers': list(range(95)) + [1000, 2000, 3000, 4000, 5000], - }) + df = pd.DataFrame( + { + "normal": range(100), + "with_outliers": list(range(95)) + [1000, 2000, 3000, 4000, 5000], + } + ) - analyzer = DatasetAnalyzer(df, selected_checks=['outliers']) + analyzer = DatasetAnalyzer(df, selected_checks=["outliers"]) summary = analyzer.analyze() # Should detect outliers - outlier_issues = [i for i in summary['issues'] if i['category'] == 'outliers'] + outlier_issues = [i for i in summary["issues"] if i["category"] == "outliers"] assert len(outlier_issues) > 0 def test_duplicate_detection(self): """Test duplicate row detection.""" - df = pd.DataFrame({ - 'a': [1, 2, 3, 1, 2], - 'b': [4, 5, 6, 4, 5], - }) + df = pd.DataFrame( + { + "a": [1, 2, 3, 1, 2], + "b": [4, 5, 6, 4, 5], + } + ) - analyzer = DatasetAnalyzer(df, selected_checks=['duplicates']) + analyzer = DatasetAnalyzer(df, selected_checks=["duplicates"]) summary = analyzer.analyze() # Check duplicate info in dataset_info - assert summary['summaries']['dataset_info']['duplicate_rows'] == 2 + assert summary["summaries"]["dataset_info"]["duplicate_rows"] == 2 def test_cardinality_checks(self): """Test high cardinality detection.""" - df = pd.DataFrame({ - 'high_card': [f'value_{i}' for i in range(1000)], # Need more unique values - 'low_card': ['A', 'B'] * 500, - 'feature': range(1000), - }) + df = pd.DataFrame( + { + "high_card": [f"value_{i}" for i in range(1000)], # Need more unique values + "low_card": ["A", "B"] * 500, + "feature": range(1000), + } + ) - analyzer = DatasetAnalyzer(df, selected_checks=['high_cardinality']) + analyzer = DatasetAnalyzer(df, selected_checks=["high_cardinality"]) summary = analyzer.analyze() # Should detect high cardinality - card_issues = [i for i in summary['issues'] if i['category'] == 'high_cardinality'] + card_issues = [i for i in summary["issues"] if i["category"] == "high_cardinality"] assert len(card_issues) >= 0 # May or may not detect depending on thresholds def test_constant_column_detection(self): """Test single value column detection.""" - df = pd.DataFrame({ - 'constant': [42] * 100, - 'variable': range(100), - }) + df = pd.DataFrame( + { + "constant": [42] * 100, + "variable": range(100), + } + ) - analyzer = DatasetAnalyzer(df, selected_checks=['single_value_columns']) - summary = analyzer.analyze() + analyzer = DatasetAnalyzer(df, selected_checks=["single_value_columns"]) + analyzer.analyze() - # Should detect constant column - constant_issues = [i for i in summary['issues'] if i['category'] == 'single_value_columns'] # Verify the check ran (may or may not generate issue depending on column type inference) - assert 'single_value_columns' in DatasetAnalyzer.ALL_CHECKS + assert "single_value_columns" in DatasetAnalyzer.ALL_CHECKS def test_class_imbalance_detection(self): """Test class imbalance detection.""" - df = pd.DataFrame({ - 'target': [0] * 95 + [1] * 5, - 'feature': range(100), - }) - - analyzer = DatasetAnalyzer( - df, - target_col='target', - selected_checks=['class_imbalance'] + df = pd.DataFrame( + { + "target": [0] * 95 + [1] * 5, + "feature": range(100), + } ) + + analyzer = DatasetAnalyzer(df, target_col="target", selected_checks=["class_imbalance"]) summary = analyzer.analyze() # Should detect imbalance - imbalance_issues = [i for i in summary['issues'] if i['category'] == 'class_imbalance'] + imbalance_issues = [i for i in summary["issues"] if i["category"] == "class_imbalance"] assert len(imbalance_issues) > 0 @@ -399,18 +366,18 @@ def test_titanic_full_analysis(self, titanic_csv): pytest.skip("Titanic dataset not found") df = pd.read_csv(titanic_csv) - analyzer = DatasetAnalyzer(df, target_col='Survived', include_plots=True) + analyzer = DatasetAnalyzer(df, target_col="Survived", include_plots=True) summary = analyzer.analyze() # Verify complete analysis assert summary is not None - assert len(summary['issues']) > 0 - assert 'plots' in summary['summaries'] + assert len(summary["issues"]) > 0 + assert "plots" in summary["summaries"] # Verify key issues are detected - categories = {issue['category'] for issue in summary['issues']} + categories = {issue["category"] for issue in summary["issues"]} # Cabin has 77% missing - check was renamed to just 'missing_values' in some versions - assert 'high_missing_values' in categories or any('missing' in cat for cat in categories) + assert "high_missing_values" in categories or any("missing" in cat for cat in categories) def test_titanic_all_report_formats(self, titanic_csv): """Test generating all report formats for Titanic.""" @@ -421,27 +388,16 @@ def test_titanic_all_report_formats(self, titanic_csv): analyzer = DatasetAnalyzer(df, include_plots=True) summary = analyzer.analyze() - formats = ['md', 'json', 'html', 'pdf'] + formats = ["md", "json", "html", "pdf"] for fmt in formats: - with tempfile.NamedTemporaryFile(mode='w', suffix=f'.{fmt}', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=f".{fmt}", delete=False) as f: output_file = f.name try: - if fmt == 'html': - report = generate_report( - summary, - format=fmt, - full=True, - output_file=output_file, - theme='minimal' - ) + if fmt == "html": + report = generate_report(summary, format=fmt, full=True, output_file=output_file, theme="minimal") else: - report = generate_report( - summary, - format=fmt, - full=True, - output_file=output_file - ) + report = generate_report(summary, format=fmt, full=True, output_file=output_file) assert report is not None assert os.path.exists(output_file) @@ -451,5 +407,5 @@ def test_titanic_all_report_formats(self, titanic_csv): os.remove(output_file) -if __name__ == '__main__': - pytest.main([__file__, '-v']) +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_sampling.py b/tests/test_sampling.py index 25d5091..4a30c47 100644 --- a/tests/test_sampling.py +++ b/tests/test_sampling.py @@ -1,8 +1,6 @@ """Tests for dataset sampling module.""" -import numpy as np import pandas as pd -import pytest from hashprep.utils.sampling import DatasetSampler, SamplingConfig @@ -62,12 +60,8 @@ def test_head_sampling(self): assert list(result["col"]) == list(range(100)) def test_stratified_sampling_preserves_proportions(self): - df = pd.DataFrame( - {"feature": range(1000), "label": ["A"] * 900 + ["B"] * 100} - ) - config = SamplingConfig( - max_rows=100, sample_method="stratified", stratify_column="label" - ) + df = pd.DataFrame({"feature": range(1000), "label": ["A"] * 900 + ["B"] * 100}) + config = SamplingConfig(max_rows=100, sample_method="stratified", stratify_column="label") sampler = DatasetSampler(config) result = sampler.sample(df) @@ -112,4 +106,4 @@ def test_should_not_sample_small_dataset(self): config = SamplingConfig(max_rows=1000) sampler = DatasetSampler(config) - assert sampler.should_sample(df) == False + assert not sampler.should_sample(df) diff --git a/uv.lock b/uv.lock index 674d497..f3ff09e 100644 --- a/uv.lock +++ b/uv.lock @@ -494,6 +494,7 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "pytest" }, + { name = "ruff" }, ] [package.metadata] @@ -514,7 +515,10 @@ requires-dist = [ ] [package.metadata.requires-dev] -dev = [{ name = "pytest", specifier = ">=9.0.2" }] +dev = [ + { name = "pytest", specifier = ">=9.0.2" }, + { name = "ruff", specifier = ">=0.8.0" }, +] [[package]] name = "iniconfig" @@ -1234,6 +1238,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" }, ] +[[package]] +name = "ruff" +version = "0.15.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/04/dc/4e6ac71b511b141cf626357a3946679abeba4cf67bc7cc5a17920f31e10d/ruff-0.15.1.tar.gz", hash = "sha256:c590fe13fb57c97141ae975c03a1aedb3d3156030cabd740d6ff0b0d601e203f", size = 4540855, upload-time = "2026-02-12T23:09:09.998Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/23/bf/e6e4324238c17f9d9120a9d60aa99a7daaa21204c07fcd84e2ef03bb5fd1/ruff-0.15.1-py3-none-linux_armv6l.whl", hash = "sha256:b101ed7cf4615bda6ffe65bdb59f964e9f4a0d3f85cbf0e54f0ab76d7b90228a", size = 10367819, upload-time = "2026-02-12T23:09:03.598Z" }, + { url = "https://files.pythonhosted.org/packages/b3/ea/c8f89d32e7912269d38c58f3649e453ac32c528f93bb7f4219258be2e7ed/ruff-0.15.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:939c995e9277e63ea632cc8d3fae17aa758526f49a9a850d2e7e758bfef46602", size = 10798618, upload-time = "2026-02-12T23:09:22.928Z" }, + { url = "https://files.pythonhosted.org/packages/5e/0f/1d0d88bc862624247d82c20c10d4c0f6bb2f346559d8af281674cf327f15/ruff-0.15.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:1d83466455fdefe60b8d9c8df81d3c1bbb2115cede53549d3b522ce2bc703899", size = 10148518, upload-time = "2026-02-12T23:08:58.339Z" }, + { url = "https://files.pythonhosted.org/packages/f5/c8/291c49cefaa4a9248e986256df2ade7add79388fe179e0691be06fae6f37/ruff-0.15.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9457e3c3291024866222b96108ab2d8265b477e5b1534c7ddb1810904858d16", size = 10518811, upload-time = "2026-02-12T23:09:31.865Z" }, + { url = "https://files.pythonhosted.org/packages/c3/1a/f5707440e5ae43ffa5365cac8bbb91e9665f4a883f560893829cf16a606b/ruff-0.15.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:92c92b003e9d4f7fbd33b1867bb15a1b785b1735069108dfc23821ba045b29bc", size = 10196169, upload-time = "2026-02-12T23:09:17.306Z" }, + { url = "https://files.pythonhosted.org/packages/2a/ff/26ddc8c4da04c8fd3ee65a89c9fb99eaa5c30394269d424461467be2271f/ruff-0.15.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fe5c41ab43e3a06778844c586251eb5a510f67125427625f9eb2b9526535779", size = 10990491, upload-time = "2026-02-12T23:09:25.503Z" }, + { url = "https://files.pythonhosted.org/packages/fc/00/50920cb385b89413f7cdb4bb9bc8fc59c1b0f30028d8bccc294189a54955/ruff-0.15.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:66a6dd6df4d80dc382c6484f8ce1bcceb55c32e9f27a8b94c32f6c7331bf14fb", size = 11843280, upload-time = "2026-02-12T23:09:19.88Z" }, + { url = "https://files.pythonhosted.org/packages/5d/6d/2f5cad8380caf5632a15460c323ae326f1e1a2b5b90a6ee7519017a017ca/ruff-0.15.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a4a42cbb8af0bda9bcd7606b064d7c0bc311a88d141d02f78920be6acb5aa83", size = 11274336, upload-time = "2026-02-12T23:09:14.907Z" }, + { url = "https://files.pythonhosted.org/packages/a3/1d/5f56cae1d6c40b8a318513599b35ea4b075d7dc1cd1d04449578c29d1d75/ruff-0.15.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ab064052c31dddada35079901592dfba2e05f5b1e43af3954aafcbc1096a5b2", size = 11137288, upload-time = "2026-02-12T23:09:07.475Z" }, + { url = "https://files.pythonhosted.org/packages/cd/20/6f8d7d8f768c93b0382b33b9306b3b999918816da46537d5a61635514635/ruff-0.15.1-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5631c940fe9fe91f817a4c2ea4e81f47bee3ca4aa646134a24374f3c19ad9454", size = 11070681, upload-time = "2026-02-12T23:08:55.43Z" }, + { url = "https://files.pythonhosted.org/packages/9a/67/d640ac76069f64cdea59dba02af2e00b1fa30e2103c7f8d049c0cff4cafd/ruff-0.15.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:68138a4ba184b4691ccdc39f7795c66b3c68160c586519e7e8444cf5a53e1b4c", size = 10486401, upload-time = "2026-02-12T23:09:27.927Z" }, + { url = "https://files.pythonhosted.org/packages/65/3d/e1429f64a3ff89297497916b88c32a5cc88eeca7e9c787072d0e7f1d3e1e/ruff-0.15.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:518f9af03bfc33c03bdb4cb63fabc935341bb7f54af500f92ac309ecfbba6330", size = 10197452, upload-time = "2026-02-12T23:09:12.147Z" }, + { url = "https://files.pythonhosted.org/packages/78/83/e2c3bade17dad63bf1e1c2ffaf11490603b760be149e1419b07049b36ef2/ruff-0.15.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:da79f4d6a826caaea95de0237a67e33b81e6ec2e25fc7e1993a4015dffca7c61", size = 10693900, upload-time = "2026-02-12T23:09:34.418Z" }, + { url = "https://files.pythonhosted.org/packages/a1/27/fdc0e11a813e6338e0706e8b39bb7a1d61ea5b36873b351acee7e524a72a/ruff-0.15.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:3dd86dccb83cd7d4dcfac303ffc277e6048600dfc22e38158afa208e8bf94a1f", size = 11227302, upload-time = "2026-02-12T23:09:36.536Z" }, + { url = "https://files.pythonhosted.org/packages/f6/58/ac864a75067dcbd3b95be5ab4eb2b601d7fbc3d3d736a27e391a4f92a5c1/ruff-0.15.1-py3-none-win32.whl", hash = "sha256:660975d9cb49b5d5278b12b03bb9951d554543a90b74ed5d366b20e2c57c2098", size = 10462555, upload-time = "2026-02-12T23:09:29.899Z" }, + { url = "https://files.pythonhosted.org/packages/e0/5e/d4ccc8a27ecdb78116feac4935dfc39d1304536f4296168f91ed3ec00cd2/ruff-0.15.1-py3-none-win_amd64.whl", hash = "sha256:c820fef9dd5d4172a6570e5721704a96c6679b80cf7be41659ed439653f62336", size = 11599956, upload-time = "2026-02-12T23:09:01.157Z" }, + { url = "https://files.pythonhosted.org/packages/2a/07/5bda6a85b220c64c65686bc85bd0bbb23b29c62b3a9f9433fa55f17cda93/ruff-0.15.1-py3-none-win_arm64.whl", hash = "sha256:5ff7d5f0f88567850f45081fac8f4ec212be8d0b963e385c3f7d0d2eb4899416", size = 10874604, upload-time = "2026-02-12T23:09:05.515Z" }, +] + [[package]] name = "scikit-learn" version = "1.7.2"