"""Data Integrity Audit Router - Powered by DuckDB""" from fastapi import APIRouter, UploadFile, File, HTTPException from fastapi.responses import StreamingResponse from pydantic import BaseModel from typing import Optional import duckdb import io import json import tempfile import os router = APIRouter() class ColumnStats(BaseModel): name: str dtype: str missing_count: int missing_percent: float unique_count: int sample_values: list min_value: Optional[str] = None max_value: Optional[str] = None mean_value: Optional[float] = None std_value: Optional[float] = None class AuditResult(BaseModel): total_rows: int total_columns: int missing_values: dict duplicate_rows: int duplicate_percent: float column_stats: list[ColumnStats] issues: list[str] recommendations: list[str] class CleaningConfig(BaseModel): remove_duplicates: bool = True fill_missing: Optional[str] = None # mean, median, mode, drop, value fill_value: Optional[str] = None remove_outliers: bool = False outlier_method: str = "iqr" # iqr, zscore outlier_threshold: float = 1.5 async def read_to_duckdb(file: UploadFile) -> tuple[duckdb.DuckDBPyConnection, str]: """Read uploaded file into DuckDB in-memory database""" content = await file.read() filename = file.filename.lower() if file.filename else "file.csv" # Create in-memory DuckDB connection conn = duckdb.connect(":memory:") # Determine file suffix if filename.endswith('.csv'): suffix = '.csv' elif filename.endswith('.json'): suffix = '.json' elif filename.endswith('.xlsx'): suffix = '.xlsx' elif filename.endswith('.xls'): suffix = '.xls' else: suffix = '.csv' with tempfile.NamedTemporaryFile(mode='wb', suffix=suffix, delete=False) as tmp: tmp.write(content) tmp_path = tmp.name try: if filename.endswith('.csv'): conn.execute(f"CREATE TABLE data AS SELECT * FROM read_csv_auto('{tmp_path}')") elif filename.endswith('.json'): conn.execute(f"CREATE TABLE data AS SELECT * FROM read_json_auto('{tmp_path}')") elif filename.endswith(('.xls', '.xlsx')): # Use DuckDB's spatial extension for Excel or the xlsx reader try: # Try st_read first (requires spatial extension) conn.execute(f"CREATE TABLE data AS SELECT * FROM st_read('{tmp_path}')") except: # Fallback to xlsx reader if available conn.execute(f"CREATE TABLE data AS SELECT * FROM read_xlsx('{tmp_path}')") else: # Default to CSV conn.execute(f"CREATE TABLE data AS SELECT * FROM read_csv_auto('{tmp_path}')") finally: os.unlink(tmp_path) return conn, "data" @router.post("/analyze") async def analyze_data(file: UploadFile = File(...)): """Analyze a dataset for integrity issues using DuckDB""" try: conn, table_name = await read_to_duckdb(file) except Exception as e: raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}") try: # Get basic stats using DuckDB total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] # Get column info columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall() column_names = [col[0] for col in columns_info] column_types = {col[0]: col[1] for col in columns_info} total_columns = len(column_names) # Missing values analysis using DuckDB SQL missing_values = {} for col in column_names: missing_count = conn.execute(f''' SELECT COUNT(*) - COUNT("{col}") as missing FROM {table_name} ''').fetchone()[0] if missing_count > 0: missing_values[col] = { "count": int(missing_count), "percent": round(missing_count / total_rows * 100, 2) } # Duplicate rows using DuckDB duplicate_query = f''' SELECT COUNT(*) as dup_count FROM ( SELECT *, COUNT(*) OVER (PARTITION BY {', 
        duplicate_query = f'''
            SELECT COUNT(*) as dup_count FROM (
                SELECT *, COUNT(*) OVER (PARTITION BY {', '.join([f'"{c}"' for c in column_names])}) as cnt
                FROM {table_name}
            ) WHERE cnt > 1
        '''
        try:
            duplicate_rows = conn.execute(duplicate_query).fetchone()[0]
        except Exception:
            # Fallback for complex cases
            duplicate_rows = 0

        duplicate_percent = round(duplicate_rows / total_rows * 100, 2) if total_rows > 0 else 0

        # Column statistics using DuckDB
        column_stats = []
        for col in column_names:
            col_type = column_types[col]

            # Get missing count
            missing_count = conn.execute(f'''
                SELECT COUNT(*) - COUNT("{col}") FROM {table_name}
            ''').fetchone()[0]
            missing_percent = round(missing_count / total_rows * 100, 2) if total_rows > 0 else 0

            # Get unique count
            unique_count = conn.execute(f'''
                SELECT COUNT(DISTINCT "{col}") FROM {table_name}
            ''').fetchone()[0]

            # Get sample values
            samples = conn.execute(f'''
                SELECT DISTINCT "{col}" FROM {table_name}
                WHERE "{col}" IS NOT NULL LIMIT 5
            ''').fetchall()
            sample_values = [str(s[0]) for s in samples]

            # Get min/max/mean/std for numeric columns
            min_val, max_val, mean_val, std_val = None, None, None, None
            if any(t in col_type.upper() for t in ('INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT')):
                stats = conn.execute(f'''
                    SELECT MIN("{col}"), MAX("{col}"), AVG("{col}"), STDDEV("{col}")
                    FROM {table_name}
                ''').fetchone()
                min_val = str(stats[0]) if stats[0] is not None else None
                max_val = str(stats[1]) if stats[1] is not None else None
                mean_val = round(float(stats[2]), 4) if stats[2] is not None else None
                std_val = round(float(stats[3]), 4) if stats[3] is not None else None

            column_stats.append(ColumnStats(
                name=col,
                dtype=col_type,
                missing_count=int(missing_count),
                missing_percent=missing_percent,
                unique_count=int(unique_count),
                sample_values=sample_values,
                min_value=min_val,
                max_value=max_val,
                mean_value=mean_val,
                std_value=std_val
            ))

        # Generate issues and recommendations
        issues = []
        recommendations = []

        # Check for missing values
        total_missing = sum(mv["count"] for mv in missing_values.values())
        if total_missing > 0:
            issues.append(f"Dataset has {total_missing:,} missing values across {len(missing_values)} columns")
            recommendations.append("Consider filling missing values with mean/median for numeric columns or mode for categorical")

        # Check for duplicates
        if duplicate_rows > 0:
            issues.append(f"Found {duplicate_rows:,} duplicate rows ({duplicate_percent}%)")
            recommendations.append("Consider removing duplicate rows to improve data quality")

        # Check for high cardinality columns
        for col in column_names:
            unique_count = conn.execute(f'SELECT COUNT(DISTINCT "{col}") FROM {table_name}').fetchone()[0]
            unique_ratio = unique_count / total_rows if total_rows > 0 else 0
            col_type = column_types[col]
            if unique_ratio > 0.9 and 'VARCHAR' in col_type.upper():
                issues.append(f"Column '{col}' has very high cardinality ({unique_count:,} unique values)")
                recommendations.append(f"Review if '{col}' should be used as an identifier rather than a feature")

        # Check for constant columns
        for col in column_names:
            unique_count = conn.execute(f'SELECT COUNT(DISTINCT "{col}") FROM {table_name}').fetchone()[0]
            if unique_count == 1:
                issues.append(f"Column '{col}' has only one unique value")
                recommendations.append(f"Consider removing constant column '{col}'")

        # Check for outliers in numeric columns using DuckDB
        outlier_columns = []
        total_outlier_count = 0
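        # Tukey's IQR rule: with IQR = Q3 - Q1, values below Q1 - 1.5*IQR or above
        # Q3 + 1.5*IQR are flagged as potential outliers.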
        for col in column_names:
            col_type = column_types[col]
            if any(t in col_type.upper() for t in ('INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT')):
                # Calculate IQR using DuckDB
                quartiles = conn.execute(f'''
                    SELECT
                        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{col}") as q1,
                        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{col}") as q3
                    FROM {table_name}
                    WHERE "{col}" IS NOT NULL
                ''').fetchone()

                if quartiles[0] is not None and quartiles[1] is not None:
                    q1, q3 = float(quartiles[0]), float(quartiles[1])
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr

                    outlier_count = conn.execute(f'''
                        SELECT COUNT(*) FROM {table_name}
                        WHERE "{col}" < {lower_bound} OR "{col}" > {upper_bound}
                    ''').fetchone()[0]

                    if outlier_count > 0:
                        outlier_pct = round(outlier_count / total_rows * 100, 1)
                        issues.append(f"Column '{col}' has {outlier_count:,} potential outliers ({outlier_pct}%)")
                        outlier_columns.append(col)
                        total_outlier_count += outlier_count

        # Add outlier recommendations
        if outlier_columns:
            if total_outlier_count > total_rows * 0.1:
                recommendations.append(f"High outlier rate detected. Review data collection process for columns: {', '.join(outlier_columns[:5])}")
            recommendations.append("Consider using robust scalers (RobustScaler) or winsorization for outlier-heavy columns")
            if len(outlier_columns) > 3:
                recommendations.append(f"Multiple columns ({len(outlier_columns)}) have outliers - consider domain-specific thresholds instead of IQR")

        if not issues:
            issues.append("No major data quality issues detected")
            recommendations.append("Dataset appears to be clean")

        return {
            "total_rows": total_rows,
            "total_columns": total_columns,
            "missing_values": missing_values,
            "duplicate_rows": int(duplicate_rows),
            "duplicate_percent": duplicate_percent,
            "column_stats": [cs.model_dump() for cs in column_stats],
            "issues": issues,
            "recommendations": recommendations,
            "engine": "DuckDB"  # Indicate we're using DuckDB
        }
    finally:
        conn.close()


@router.post("/analyze-duckdb")
async def analyze_with_sql(file: UploadFile = File(...), query: Optional[str] = None):
    """Run custom SQL analysis on uploaded data using DuckDB."""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        if query:
            # Run custom query (replace 'data' with the actual table name)
            safe_query = query.replace("FROM data", f"FROM {table_name}").replace("from data", f"FROM {table_name}")

            # Get column names from the query description
            desc = conn.execute(f"DESCRIBE ({safe_query})").fetchall()
            columns = [col[0] for col in desc]

            # Fetch data as a list of tuples
            rows = conn.execute(safe_query).fetchall()

            # Convert to a list of dicts
            data = [dict(zip(columns, row)) for row in rows]

            return {
                "columns": columns,
                "data": data,
                "row_count": len(rows)
            }
        else:
            # Return summary using DuckDB SUMMARIZE
            desc = conn.execute(f"DESCRIBE (SUMMARIZE {table_name})").fetchall()
            columns = [col[0] for col in desc]
            rows = conn.execute(f"SUMMARIZE {table_name}").fetchall()
            data = [dict(zip(columns, row)) for row in rows]

            return {
                "columns": columns,
                "data": data,
                "row_count": len(rows)
            }
    finally:
        conn.close()


@router.post("/clean")
async def clean_data(file: UploadFile = File(...)):
    """Clean a dataset using DuckDB."""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        original_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        changes = []

        # Get column names
        columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
        column_names = [col[0] for col in columns_info]

        # Remove duplicates using DuckDB
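        # SELECT DISTINCT materializes a deduplicated copy; comparing row counts
        # before and after gives the number of duplicates removed.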
        conn.execute(f'''
            CREATE TABLE cleaned AS
            SELECT DISTINCT * FROM {table_name}
        ''')
        rows_after_dedup = conn.execute("SELECT COUNT(*) FROM cleaned").fetchone()[0]
        duplicates_removed = original_rows - rows_after_dedup
        if duplicates_removed > 0:
            changes.append(f"Removed {duplicates_removed:,} duplicate rows")

        # Count rows with any NULL values
        null_conditions = " OR ".join([f'"{col}" IS NULL' for col in column_names])
        rows_with_nulls = conn.execute(f'''
            SELECT COUNT(*) FROM cleaned WHERE {null_conditions}
        ''').fetchone()[0]

        # Remove rows with NULL values
        not_null_conditions = " AND ".join([f'"{col}" IS NOT NULL' for col in column_names])
        conn.execute(f'''
            CREATE TABLE final_cleaned AS
            SELECT * FROM cleaned WHERE {not_null_conditions}
        ''')

        cleaned_rows = conn.execute("SELECT COUNT(*) FROM final_cleaned").fetchone()[0]
        rows_dropped = rows_after_dedup - cleaned_rows
        if rows_dropped > 0:
            changes.append(f"Dropped {rows_dropped:,} rows with missing values")

        return {
            "message": "Data cleaned successfully",
            "original_rows": original_rows,
            "cleaned_rows": cleaned_rows,
            "rows_removed": original_rows - cleaned_rows,
            "changes": changes,
            "engine": "DuckDB"
        }
    finally:
        conn.close()


@router.post("/validate-schema")
async def validate_schema(file: UploadFile = File(...)):
    """Validate dataset schema using DuckDB."""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        row_count = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()

        schema = []
        for col in columns_info:
            col_name = col[0]
            col_type = col[1]

            # Check if nullable
            null_count = conn.execute(f'''
                SELECT COUNT(*) - COUNT("{col_name}") FROM {table_name}
            ''').fetchone()[0]

            # Get unique count
            unique_count = conn.execute(f'''
                SELECT COUNT(DISTINCT "{col_name}") FROM {table_name}
            ''').fetchone()[0]

            schema.append({
                "column": col_name,
                "dtype": col_type,
                "nullable": null_count > 0,
                "null_count": int(null_count),
                "unique_values": int(unique_count)
            })

        return {
            "valid": True,
            "row_count": row_count,
            "column_count": len(columns_info),
            "schema": schema,
            "engine": "DuckDB"
        }
    finally:
        conn.close()


@router.post("/detect-outliers")
async def detect_outliers(file: UploadFile = File(...)):
    """Detect outliers using DuckDB."""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()

        numeric_cols = []
        outliers_by_column = {}
        total_outliers = 0

        for col in columns_info:
            col_name = col[0]
            col_type = col[1].upper()

            # Check if numeric
            if any(t in col_type for t in ['INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT', 'REAL']):
                numeric_cols.append(col_name)

                # Calculate IQR
                quartiles = conn.execute(f'''
                    SELECT
                        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{col_name}") as q1,
                        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{col_name}") as q3,
                        MIN("{col_name}") as min_val,
                        MAX("{col_name}") as max_val
                    FROM {table_name}
                    WHERE "{col_name}" IS NOT NULL
                ''').fetchone()

                if quartiles[0] is not None and quartiles[1] is not None:
                    q1, q3 = float(quartiles[0]), float(quartiles[1])
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
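                    # Count values outside the IQR fences; the IS NOT NULL guard keeps
                    # missing values from being reported as outliers.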
                    outlier_count = conn.execute(f'''
                        SELECT COUNT(*) FROM {table_name}
                        WHERE "{col_name}" IS NOT NULL
                          AND ("{col_name}" < {lower_bound} OR "{col_name}" > {upper_bound})
                    ''').fetchone()[0]

                    if outlier_count > 0:
                        outliers_by_column[col_name] = {
                            "count": int(outlier_count),
                            "percent": round(outlier_count / total_rows * 100, 2),
                            "lower_bound": round(lower_bound, 2),
                            "upper_bound": round(upper_bound, 2),
                            "q1": round(q1, 2),
                            "q3": round(q3, 2),
                            "iqr": round(iqr, 2),
                            "min_value": round(float(quartiles[2]), 2) if quartiles[2] is not None else None,
                            "max_value": round(float(quartiles[3]), 2) if quartiles[3] is not None else None
                        }
                        total_outliers += outlier_count

        return {
            "numeric_columns": numeric_cols,
            "outliers_by_column": outliers_by_column,
            "total_outliers": int(total_outliers),
            "total_rows": total_rows,
            "engine": "DuckDB"
        }
    finally:
        conn.close()


@router.post("/profile")
async def profile_data(file: UploadFile = File(...)):
    """Generate a comprehensive data profile using DuckDB SUMMARIZE."""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        # Use DuckDB's built-in SUMMARIZE - get columns and data without pandas
        desc = conn.execute(f"DESCRIBE (SUMMARIZE {table_name})").fetchall()
        columns = [col[0] for col in desc]
        rows = conn.execute(f"SUMMARIZE {table_name}").fetchall()

        # Get row count
        total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]

        # Convert to a list of dicts
        profile = [dict(zip(columns, row)) for row in rows]

        return {
            "total_rows": total_rows,
            "total_columns": len(profile),
            "profile": profile,
            "engine": "DuckDB"
        }
    finally:
        conn.close()
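

# ---------------------------------------------------------------------------
# Usage sketch (not part of the router): one way this module might be wired
# into a FastAPI app and exercised with the test client. The import path
# "routers.data_audit" and the "/audit" prefix are assumptions for
# illustration, not something defined in this file.
# ---------------------------------------------------------------------------
#
# from fastapi import FastAPI
# from fastapi.testclient import TestClient
# from routers.data_audit import router  # hypothetical module path
#
# app = FastAPI()
# app.include_router(router, prefix="/audit")
#
# client = TestClient(app)
# with open("example.csv", "rb") as f:
#     resp = client.post(
#         "/audit/analyze",
#         files={"file": ("example.csv", f, "text/csv")},
#     )
# print(resp.json()["issues"])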