Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: CI

on:
push:
branches: [main]
pull_request:
branches: [main]

jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12"]

steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v4

- name: Set up Python ${{ matrix.python-version }}
run: uv python install ${{ matrix.python-version }}

- name: Install dependencies
run: uv sync --dev

- name: Run tests
run: uv run pytest tests/ -v --tb=short

lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v4

- name: Set up Python
run: uv python install 3.12

- name: Install dependencies
run: uv sync --dev

- name: Run ruff check
run: uv run ruff check .

- name: Run ruff format check
run: uv run ruff format --check .
35 changes: 16 additions & 19 deletions examples/reports/train_hashprep_report_fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
Review and adapt before production use.
"""

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd


Expand All @@ -14,51 +11,51 @@ def apply_fixes(df):
df = df.copy()

# Column 'Cabin' has 77% missing values
df = df.drop(columns=['Cabin'])
df = df.drop(columns=["Cabin"])

# Frequency encode high-cardinality column 'Name'
freq_Name = df['Name'].value_counts(normalize=True)
df['Name_encoded'] = df['Name'].map(freq_Name)
freq_Name = df["Name"].value_counts(normalize=True)
df["Name_encoded"] = df["Name"].map(freq_Name)

# Frequency encode high-cardinality column 'Ticket'
freq_Ticket = df['Ticket'].value_counts(normalize=True)
df['Ticket_encoded'] = df['Ticket'].map(freq_Ticket)
freq_Ticket = df["Ticket"].value_counts(normalize=True)
df["Ticket_encoded"] = df["Ticket"].map(freq_Ticket)

# Clip outliers in 'Fare' using IQR method
q1_Fare, q3_Fare = df['Fare'].quantile([0.25, 0.75])
q1_Fare, q3_Fare = df["Fare"].quantile([0.25, 0.75])
iqr_Fare = q3_Fare - q1_Fare
lower_Fare, upper_Fare = q1_Fare - 1.5 * iqr_Fare, q3_Fare + 1.5 * iqr_Fare
df['Fare'] = df['Fare'].clip(lower=lower_Fare, upper=upper_Fare)
df["Fare"] = df["Fare"].clip(lower=lower_Fare, upper=upper_Fare)

# Clip outliers in 'Parch' using IQR method
q1_Parch, q3_Parch = df['Parch'].quantile([0.25, 0.75])
q1_Parch, q3_Parch = df["Parch"].quantile([0.25, 0.75])
iqr_Parch = q3_Parch - q1_Parch
lower_Parch, upper_Parch = q1_Parch - 1.5 * iqr_Parch, q3_Parch + 1.5 * iqr_Parch
df['Parch'] = df['Parch'].clip(lower=lower_Parch, upper=upper_Parch)
df["Parch"] = df["Parch"].clip(lower=lower_Parch, upper=upper_Parch)

# Clip outliers in 'SibSp' using IQR method
q1_SibSp, q3_SibSp = df['SibSp'].quantile([0.25, 0.75])
q1_SibSp, q3_SibSp = df["SibSp"].quantile([0.25, 0.75])
iqr_SibSp = q3_SibSp - q1_SibSp
lower_SibSp, upper_SibSp = q1_SibSp - 1.5 * iqr_SibSp, q3_SibSp + 1.5 * iqr_SibSp
df['SibSp'] = df['SibSp'].clip(lower=lower_SibSp, upper=upper_SibSp)
df["SibSp"] = df["SibSp"].clip(lower=lower_SibSp, upper=upper_SibSp)

# Drop highly correlated column 'Survived,Sex'
df = df.drop(columns=['Survived,Sex'])
df = df.drop(columns=["Survived,Sex"])

return df


if __name__ == '__main__':
if __name__ == "__main__":
import sys

if len(sys.argv) < 2:
print('Usage: python fixes.py <input.csv> [output.csv]')
print("Usage: python fixes.py <input.csv> [output.csv]")
sys.exit(1)

input_file = sys.argv[1]
output_file = sys.argv[2] if len(sys.argv) > 2 else 'cleaned_data.csv'
output_file = sys.argv[2] if len(sys.argv) > 2 else "cleaned_data.csv"

df = pd.read_csv(input_file)
df_clean = apply_fixes(df)
df_clean.to_csv(output_file, index=False)
print(f'Cleaned data saved to {output_file}')
print(f"Cleaned data saved to {output_file}")
30 changes: 14 additions & 16 deletions examples/reports/train_hashprep_report_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,27 @@

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np


def build_preprocessing_pipeline():
"""Build sklearn preprocessing pipeline."""

transformers = [
('drop_column_Cabin', 'drop', ['Cabin']),
('drop_column_Survived,S', 'drop', ['Survived,Sex']),
("drop_column_Cabin", "drop", ["Cabin"]),
("drop_column_Survived,S", "drop", ["Survived,Sex"]),
]

preprocessor = ColumnTransformer(
transformers=transformers,
remainder='passthrough',
remainder="passthrough",
verbose_feature_names_out=False,
)

pipeline = Pipeline([
('preprocessor', preprocessor),
])
pipeline = Pipeline(
[
("preprocessor", preprocessor),
]
)

return pipeline

Expand All @@ -37,20 +37,18 @@ def get_pre_pipeline_steps():
"""
steps = []
# Outlier clipping for ['Fare']
steps.append(('clip_outliers_Fare', None)) # Implement manually
steps.append(("clip_outliers_Fare", None)) # Implement manually
# Outlier clipping for ['Parch']
steps.append(('clip_outliers_Parch', None)) # Implement manually
steps.append(("clip_outliers_Parch", None)) # Implement manually
# Outlier clipping for ['SibSp']
steps.append(('clip_outliers_SibSp', None)) # Implement manually
steps.append(("clip_outliers_SibSp", None)) # Implement manually
return steps


if __name__ == '__main__':
import joblib

if __name__ == "__main__":
pipeline = build_preprocessing_pipeline()
if pipeline:
print('Pipeline created successfully')
print("Pipeline created successfully")
print(pipeline)
# Example: Save pipeline
# joblib.dump(pipeline, 'preprocessing_pipeline.joblib')
# joblib.dump(pipeline, 'preprocessing_pipeline.joblib')
4 changes: 2 additions & 2 deletions hashprep/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .core.analyzer import DatasetAnalyzer
from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer

__version__ = "0.1.0b1"
__version__ = "0.1.0b1"
38 changes: 20 additions & 18 deletions hashprep/checks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
from typing import List, Optional

from .core import Issue
from .columns import _check_duplicates, _check_high_cardinality, _check_mixed_data_types, _check_single_value_columns
from .core import Issue as Issue
from .correlations import calculate_correlations
from .distribution import _check_uniform_distribution, _check_unique_values
from .drift import check_drift
from .imbalance import _check_class_imbalance
from .leakage import _check_data_leakage, _check_target_leakage_patterns
from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, \
_check_missing_patterns
from .columns import _check_single_value_columns, _check_high_cardinality, _check_duplicates, _check_mixed_data_types
from .missing_values import (
_check_dataset_missingness,
_check_empty_columns,
_check_high_missing_values,
_check_missing_patterns,
)
from .outliers import (
_check_outliers,
_check_high_zero_counts,
_check_extreme_text_lengths,
_check_datetime_skew,
_check_skewness,
_check_infinite_values,
_check_constant_length,
_check_datetime_skew,
_check_empty_dataset,
_check_extreme_text_lengths,
_check_high_zero_counts,
_check_infinite_values,
_check_outliers,
_check_skewness,
)
from .correlations import calculate_correlations
from .imbalance import _check_class_imbalance
from .distribution import _check_uniform_distribution, _check_unique_values


def _check_dataset_drift(analyzer):
"""Wrapper for drift detection that uses analyzer's comparison_df."""
if hasattr(analyzer, 'comparison_df') and analyzer.comparison_df is not None:
if hasattr(analyzer, "comparison_df") and analyzer.comparison_df is not None:
return check_drift(analyzer.df, analyzer.comparison_df)
return []

Expand Down Expand Up @@ -56,7 +58,7 @@ def _check_dataset_drift(analyzer):
CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"}


def run_checks(analyzer, checks_to_run: List[str]):
def run_checks(analyzer, checks_to_run: list[str]):
issues = []
correlation_requested = False

Expand All @@ -70,4 +72,4 @@ def run_checks(analyzer, checks_to_run: List[str]):
if correlation_requested:
issues.extend(calculate_correlations(analyzer))

return issues
return issues
14 changes: 11 additions & 3 deletions hashprep/checks/columns.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from .core import Issue
from ..config import DEFAULT_CONFIG
from .core import Issue

_COL_THRESHOLDS = DEFAULT_CONFIG.columns


def _check_single_value_columns(analyzer):
issues = []
for col in analyzer.df.columns:
Expand All @@ -26,7 +27,12 @@ def _check_single_value_columns(analyzer):
)
return issues

def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_cardinality_count, critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical):

def _check_high_cardinality(
analyzer,
threshold: int = _COL_THRESHOLDS.high_cardinality_count,
critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical,
):
issues = []
categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist()
for col in categorical_cols:
Expand All @@ -52,6 +58,7 @@ def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_card
)
return issues


def _check_duplicates(analyzer):
issues = []
duplicate_rows = int(analyzer.df.duplicated().sum())
Expand All @@ -76,6 +83,7 @@ def _check_duplicates(analyzer):
)
return issues


def _check_mixed_data_types(analyzer):
issues = []
for col in analyzer.df.columns:
Expand All @@ -91,4 +99,4 @@ def _check_mixed_data_types(analyzer):
quick_fix="Options: \n- Cast to single type: Ensure consistency (Pros: Simplifies processing; Cons: May lose nuance).\n- Split column: Separate types into new features (Pros: Preserves info; Cons: Adds complexity).\n- Investigate source: Check data collection errors (Pros: Improves quality; Cons: Time-consuming).",
)
)
return issues
return issues
3 changes: 1 addition & 2 deletions hashprep/checks/core.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from dataclasses import dataclass

@dataclass

@dataclass
class Issue:

category: str

severity: str # critical or warning
Expand Down
Loading