# ai-tools-suite/backend/routers/audit.py
"""Data Integrity Audit Router - Powered by DuckDB"""
from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional
import duckdb
import io
import json
import tempfile
import os
router = APIRouter()
class ColumnStats(BaseModel):
name: str
dtype: str
missing_count: int
missing_percent: float
unique_count: int
sample_values: list
min_value: Optional[str] = None
max_value: Optional[str] = None
mean_value: Optional[float] = None
std_value: Optional[float] = None
class AuditResult(BaseModel):
total_rows: int
total_columns: int
missing_values: dict
duplicate_rows: int
duplicate_percent: float
column_stats: list[ColumnStats]
issues: list[str]
recommendations: list[str]
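# Note: /analyze assembles a response with these AuditResult fields (plus an "engine"
# marker) but returns a plain dict rather than the model itself.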
class CleaningConfig(BaseModel):
remove_duplicates: bool = True
fill_missing: Optional[str] = None # mean, median, mode, drop, value
fill_value: Optional[str] = None
remove_outliers: bool = False
outlier_method: str = "iqr" # iqr, zscore
outlier_threshold: float = 1.5
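# Illustrative request body CleaningConfig would parse (values are examples only; the
# /clean endpoint below currently accepts just a file upload):
#   {"remove_duplicates": true, "fill_missing": "median", "fill_value": null,
#    "remove_outliers": true, "outlier_method": "iqr", "outlier_threshold": 1.5}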
async def read_to_duckdb(file: UploadFile) -> tuple[duckdb.DuckDBPyConnection, str]:
"""Read uploaded file into DuckDB in-memory database"""
content = await file.read()
filename = file.filename.lower() if file.filename else "file.csv"
# Create in-memory DuckDB connection
conn = duckdb.connect(":memory:")
# Determine file suffix
if filename.endswith('.csv'):
suffix = '.csv'
elif filename.endswith('.json'):
suffix = '.json'
elif filename.endswith('.xlsx'):
suffix = '.xlsx'
elif filename.endswith('.xls'):
suffix = '.xls'
else:
suffix = '.csv'
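    # DuckDB's file readers work on paths, so stage the upload in a temporary file first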
with tempfile.NamedTemporaryFile(mode='wb', suffix=suffix, delete=False) as tmp:
tmp.write(content)
tmp_path = tmp.name
try:
if filename.endswith('.csv'):
conn.execute(f"CREATE TABLE data AS SELECT * FROM read_csv_auto('{tmp_path}')")
elif filename.endswith('.json'):
conn.execute(f"CREATE TABLE data AS SELECT * FROM read_json_auto('{tmp_path}')")
        elif filename.endswith(('.xls', '.xlsx')):
            # Excel files: st_read needs DuckDB's spatial extension and read_xlsx needs the
            # excel extension (either may be autoloaded depending on the DuckDB build)
            try:
                # Try st_read first (requires the spatial extension)
                conn.execute(f"CREATE TABLE data AS SELECT * FROM st_read('{tmp_path}')")
            except Exception:
                # Fall back to the xlsx reader if available
                conn.execute(f"CREATE TABLE data AS SELECT * FROM read_xlsx('{tmp_path}')")
else:
# Default to CSV
conn.execute(f"CREATE TABLE data AS SELECT * FROM read_csv_auto('{tmp_path}')")
finally:
os.unlink(tmp_path)
return conn, "data"
@router.post("/analyze")
async def analyze_data(file: UploadFile = File(...)):
"""Analyze a dataset for integrity issues using DuckDB"""
try:
conn, table_name = await read_to_duckdb(file)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")
try:
# Get basic stats using DuckDB
total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
# Get column info
columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
column_names = [col[0] for col in columns_info]
column_types = {col[0]: col[1] for col in columns_info}
total_columns = len(column_names)
# Missing values analysis using DuckDB SQL
missing_values = {}
for col in column_names:
missing_count = conn.execute(f'''
SELECT COUNT(*) - COUNT("{col}") as missing FROM {table_name}
''').fetchone()[0]
if missing_count > 0:
missing_values[col] = {
"count": int(missing_count),
"percent": round(missing_count / total_rows * 100, 2)
}
        # Duplicate rows using DuckDB: count every row that belongs to a group of fully
        # identical rows (the first occurrence is included in this count)
        duplicate_query = f'''
            SELECT COUNT(*) as dup_count FROM (
                SELECT *, COUNT(*) OVER (PARTITION BY {', '.join([f'"{c}"' for c in column_names])}) as cnt
                FROM {table_name}
            ) WHERE cnt > 1
        '''
        try:
            duplicate_rows = conn.execute(duplicate_query).fetchone()[0]
        except Exception:
            # Window partitioning can fail on unsupported column types; report zero duplicates
            duplicate_rows = 0
duplicate_percent = round(duplicate_rows / total_rows * 100, 2) if total_rows > 0 else 0
# Column statistics using DuckDB
column_stats = []
for col in column_names:
col_type = column_types[col]
# Get missing count
missing_count = conn.execute(f'''
SELECT COUNT(*) - COUNT("{col}") FROM {table_name}
''').fetchone()[0]
missing_percent = round(missing_count / total_rows * 100, 2) if total_rows > 0 else 0
# Get unique count
unique_count = conn.execute(f'''
SELECT COUNT(DISTINCT "{col}") FROM {table_name}
''').fetchone()[0]
# Get sample values
samples = conn.execute(f'''
SELECT DISTINCT "{col}" FROM {table_name}
WHERE "{col}" IS NOT NULL
LIMIT 5
''').fetchall()
sample_values = [str(s[0]) for s in samples]
# Get min/max/mean/std for numeric columns
min_val, max_val, mean_val, std_val = None, None, None, None
            if any(t in col_type.upper() for t in ('INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT')):
stats = conn.execute(f'''
SELECT
MIN("{col}"),
MAX("{col}"),
AVG("{col}"),
STDDEV("{col}")
FROM {table_name}
''').fetchone()
min_val = str(stats[0]) if stats[0] is not None else None
max_val = str(stats[1]) if stats[1] is not None else None
mean_val = round(float(stats[2]), 4) if stats[2] is not None else None
std_val = round(float(stats[3]), 4) if stats[3] is not None else None
column_stats.append(ColumnStats(
name=col,
dtype=col_type,
missing_count=int(missing_count),
missing_percent=missing_percent,
unique_count=int(unique_count),
sample_values=sample_values,
min_value=min_val,
max_value=max_val,
mean_value=mean_val,
std_value=std_val
))
# Generate issues and recommendations
issues = []
recommendations = []
# Check for missing values
total_missing = sum(mv["count"] for mv in missing_values.values())
if total_missing > 0:
issues.append(f"Dataset has {total_missing:,} missing values across {len(missing_values)} columns")
recommendations.append("Consider filling missing values with mean/median for numeric columns or mode for categorical")
# Check for duplicates
if duplicate_rows > 0:
issues.append(f"Found {duplicate_rows:,} duplicate rows ({duplicate_percent}%)")
recommendations.append("Consider removing duplicate rows to improve data quality")
# Check for high cardinality columns
for col in column_names:
unique_count = conn.execute(f'SELECT COUNT(DISTINCT "{col}") FROM {table_name}').fetchone()[0]
unique_ratio = unique_count / total_rows if total_rows > 0 else 0
col_type = column_types[col]
if unique_ratio > 0.9 and 'VARCHAR' in col_type.upper():
issues.append(f"Column '{col}' has very high cardinality ({unique_count:,} unique values)")
recommendations.append(f"Review if '{col}' should be used as an identifier rather than a feature")
# Check for constant columns
for col in column_names:
unique_count = conn.execute(f'SELECT COUNT(DISTINCT "{col}") FROM {table_name}').fetchone()[0]
if unique_count == 1:
issues.append(f"Column '{col}' has only one unique value")
recommendations.append(f"Consider removing constant column '{col}'")
# Check for outliers in numeric columns using DuckDB
outlier_columns = []
total_outlier_count = 0
for col in column_names:
col_type = column_types[col]
            if any(t in col_type.upper() for t in ('INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT')):
# Calculate IQR using DuckDB
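                # Tukey's 1.5*IQR fences: e.g. with Q1 = 10 and Q3 = 30, IQR = 20,
                # so values below -20 or above 60 are flagged as potential outliers.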
quartiles = conn.execute(f'''
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{col}") as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{col}") as q3
FROM {table_name}
WHERE "{col}" IS NOT NULL
''').fetchone()
if quartiles[0] is not None and quartiles[1] is not None:
q1, q3 = float(quartiles[0]), float(quartiles[1])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outlier_count = conn.execute(f'''
SELECT COUNT(*) FROM {table_name}
WHERE "{col}" < {lower_bound} OR "{col}" > {upper_bound}
''').fetchone()[0]
if outlier_count > 0:
outlier_pct = round(outlier_count / total_rows * 100, 1)
issues.append(f"Column '{col}' has {outlier_count:,} potential outliers ({outlier_pct}%)")
outlier_columns.append(col)
total_outlier_count += outlier_count
# Add outlier recommendations
if outlier_columns:
if total_outlier_count > total_rows * 0.1:
recommendations.append(f"High outlier rate detected. Review data collection process for columns: {', '.join(outlier_columns[:5])}")
recommendations.append("Consider using robust scalers (RobustScaler) or winsorization for outlier-heavy columns")
if len(outlier_columns) > 3:
recommendations.append(f"Multiple columns ({len(outlier_columns)}) have outliers - consider domain-specific thresholds instead of IQR")
if not issues:
issues.append("No major data quality issues detected")
recommendations.append("Dataset appears to be clean")
return {
"total_rows": total_rows,
"total_columns": total_columns,
"missing_values": missing_values,
"duplicate_rows": int(duplicate_rows),
"duplicate_percent": duplicate_percent,
"column_stats": [cs.model_dump() for cs in column_stats],
"issues": issues,
"recommendations": recommendations,
"engine": "DuckDB" # Indicate we're using DuckDB
}
finally:
conn.close()
@router.post("/analyze-duckdb")
async def analyze_with_sql(file: UploadFile = File(...), query: Optional[str] = None):
"""Run custom SQL analysis on uploaded data using DuckDB"""
try:
conn, table_name = await read_to_duckdb(file)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")
try:
        if query:
            # The uploaded file is always registered as a table named 'data'; normalize the
            # common casings of "FROM data" before running the user's query
            safe_query = query.replace("FROM data", f"FROM {table_name}").replace("from data", f"FROM {table_name}")
# Get column names from description
desc = conn.execute(f"DESCRIBE ({safe_query})").fetchall()
columns = [col[0] for col in desc]
# Fetch data as list of tuples
rows = conn.execute(safe_query).fetchall()
# Convert to list of dicts
data = [dict(zip(columns, row)) for row in rows]
return {
"columns": columns,
"data": data,
"row_count": len(rows)
}
else:
# Return summary using DuckDB SUMMARIZE
desc = conn.execute(f"DESCRIBE (SUMMARIZE {table_name})").fetchall()
columns = [col[0] for col in desc]
rows = conn.execute(f"SUMMARIZE {table_name}").fetchall()
data = [dict(zip(columns, row)) for row in rows]
return {
"columns": columns,
"data": data,
"row_count": len(rows)
}
finally:
conn.close()
@router.post("/clean")
async def clean_data(file: UploadFile = File(...)):
"""Clean a dataset using DuckDB"""
try:
conn, table_name = await read_to_duckdb(file)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")
try:
original_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
changes = []
# Get column names
columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
column_names = [col[0] for col in columns_info]
# Remove duplicates using DuckDB
conn.execute(f'''
CREATE TABLE cleaned AS
SELECT DISTINCT * FROM {table_name}
''')
rows_after_dedup = conn.execute("SELECT COUNT(*) FROM cleaned").fetchone()[0]
duplicates_removed = original_rows - rows_after_dedup
if duplicates_removed > 0:
changes.append(f"Removed {duplicates_removed:,} duplicate rows")
        # Count, then drop, rows that contain any NULL values
        null_conditions = " OR ".join([f'"{col}" IS NULL' for col in column_names])
        rows_with_nulls = conn.execute(f'''
            SELECT COUNT(*) FROM cleaned WHERE {null_conditions}
        ''').fetchone()[0]
        not_null_conditions = " AND ".join([f'"{col}" IS NOT NULL' for col in column_names])
        conn.execute(f'''
            CREATE TABLE final_cleaned AS
            SELECT * FROM cleaned WHERE {not_null_conditions}
        ''')
        cleaned_rows = conn.execute("SELECT COUNT(*) FROM final_cleaned").fetchone()[0]
        if rows_with_nulls > 0:
            changes.append(f"Dropped {rows_with_nulls:,} rows with missing values")
return {
"message": "Data cleaned successfully",
"original_rows": original_rows,
"cleaned_rows": cleaned_rows,
"rows_removed": original_rows - cleaned_rows,
"changes": changes,
"engine": "DuckDB"
}
finally:
conn.close()
@router.post("/validate-schema")
async def validate_schema(file: UploadFile = File(...)):
"""Validate dataset schema using DuckDB"""
try:
conn, table_name = await read_to_duckdb(file)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")
try:
row_count = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
schema = []
for col in columns_info:
col_name = col[0]
col_type = col[1]
# Check if nullable
null_count = conn.execute(f'''
SELECT COUNT(*) - COUNT("{col_name}") FROM {table_name}
''').fetchone()[0]
# Get unique count
unique_count = conn.execute(f'''
SELECT COUNT(DISTINCT "{col_name}") FROM {table_name}
''').fetchone()[0]
schema.append({
"column": col_name,
"dtype": col_type,
"nullable": null_count > 0,
"null_count": int(null_count),
"unique_values": int(unique_count)
})
return {
"valid": True,
"row_count": row_count,
"column_count": len(columns_info),
"schema": schema,
"engine": "DuckDB"
}
finally:
conn.close()
@router.post("/detect-outliers")
async def detect_outliers(file: UploadFile = File(...)):
"""Detect outliers using DuckDB"""
try:
conn, table_name = await read_to_duckdb(file)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")
try:
total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
numeric_cols = []
outliers_by_column = {}
total_outliers = 0
for col in columns_info:
col_name = col[0]
col_type = col[1].upper()
# Check if numeric
if any(t in col_type for t in ['INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT', 'REAL']):
numeric_cols.append(col_name)
# Calculate IQR
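                # Same 1.5*IQR fence rule as /analyze; the bounds and quartiles are included
                # in the response so clients can see why values were flagged.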
quartiles = conn.execute(f'''
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{col_name}") as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{col_name}") as q3,
MIN("{col_name}") as min_val,
MAX("{col_name}") as max_val
FROM {table_name}
WHERE "{col_name}" IS NOT NULL
''').fetchone()
if quartiles[0] is not None and quartiles[1] is not None:
q1, q3 = float(quartiles[0]), float(quartiles[1])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outlier_count = conn.execute(f'''
SELECT COUNT(*) FROM {table_name}
WHERE "{col_name}" IS NOT NULL
AND ("{col_name}" < {lower_bound} OR "{col_name}" > {upper_bound})
''').fetchone()[0]
if outlier_count > 0:
outliers_by_column[col_name] = {
"count": int(outlier_count),
"percent": round(outlier_count / total_rows * 100, 2),
"lower_bound": round(lower_bound, 2),
"upper_bound": round(upper_bound, 2),
"q1": round(q1, 2),
"q3": round(q3, 2),
"iqr": round(iqr, 2),
"min_value": round(float(quartiles[2]), 2) if quartiles[2] else None,
"max_value": round(float(quartiles[3]), 2) if quartiles[3] else None
}
total_outliers += outlier_count
return {
"numeric_columns": numeric_cols,
"outliers_by_column": outliers_by_column,
"total_outliers": int(total_outliers),
"total_rows": total_rows,
"engine": "DuckDB"
}
finally:
conn.close()
@router.post("/profile")
async def profile_data(file: UploadFile = File(...)):
"""Generate a comprehensive data profile using DuckDB SUMMARIZE"""
try:
conn, table_name = await read_to_duckdb(file)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")
try:
# Use DuckDB's built-in SUMMARIZE - get columns and data without pandas
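        # SUMMARIZE returns one row per column with statistics such as min, max,
        # approx_unique, avg, std, quartiles, count and null_percentage.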
desc = conn.execute(f"DESCRIBE (SUMMARIZE {table_name})").fetchall()
columns = [col[0] for col in desc]
rows = conn.execute(f"SUMMARIZE {table_name}").fetchall()
# Get row count
total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
# Convert to list of dicts
profile = [dict(zip(columns, row)) for row in rows]
return {
"total_rows": total_rows,
"total_columns": len(profile),
"profile": profile,
"engine": "DuckDB"
}
finally:
conn.close()