"""Data Integrity Audit Router - Powered by DuckDB"""

from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional
import duckdb
import io
import json
import tempfile
import os

router = APIRouter()
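# Example wiring (illustrative sketch only; the module that creates the FastAPI
# app, the "/audit" prefix, and the tag name are assumptions, not part of this file):
#
#     from fastapi import FastAPI
#     app = FastAPI()
#     app.include_router(router, prefix="/audit", tags=["data-audit"])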


class ColumnStats(BaseModel):
    name: str
    dtype: str
    missing_count: int
    missing_percent: float
    unique_count: int
    sample_values: list
    min_value: Optional[str] = None
    max_value: Optional[str] = None
    mean_value: Optional[float] = None
    std_value: Optional[float] = None


class AuditResult(BaseModel):
    total_rows: int
    total_columns: int
    missing_values: dict
    duplicate_rows: int
    duplicate_percent: float
    column_stats: list[ColumnStats]
    issues: list[str]
    recommendations: list[str]


class CleaningConfig(BaseModel):
    remove_duplicates: bool = True
    fill_missing: Optional[str] = None  # mean, median, mode, drop, value
    fill_value: Optional[str] = None
    remove_outliers: bool = False
    outlier_method: str = "iqr"  # iqr, zscore
    outlier_threshold: float = 1.5


async def read_to_duckdb(file: UploadFile) -> tuple[duckdb.DuckDBPyConnection, str]:
    """Read an uploaded file into an in-memory DuckDB database."""
    content = await file.read()
    filename = file.filename.lower() if file.filename else "file.csv"

    # Create an in-memory DuckDB connection
    conn = duckdb.connect(":memory:")

    # Determine the file suffix for the temporary copy
    if filename.endswith('.csv'):
        suffix = '.csv'
    elif filename.endswith('.json'):
        suffix = '.json'
    elif filename.endswith('.xlsx'):
        suffix = '.xlsx'
    elif filename.endswith('.xls'):
        suffix = '.xls'
    else:
        suffix = '.csv'

    # Write the upload to a temporary file so DuckDB's readers can open it by path
    with tempfile.NamedTemporaryFile(mode='wb', suffix=suffix, delete=False) as tmp:
        tmp.write(content)
        tmp_path = tmp.name

    try:
        if filename.endswith('.csv'):
            conn.execute(f"CREATE TABLE data AS SELECT * FROM read_csv_auto('{tmp_path}')")
        elif filename.endswith('.json'):
            conn.execute(f"CREATE TABLE data AS SELECT * FROM read_json_auto('{tmp_path}')")
        elif filename.endswith(('.xls', '.xlsx')):
            # Use DuckDB's spatial extension for Excel, or the xlsx reader as a fallback
            try:
                # Try st_read first (requires the spatial extension)
                conn.execute(f"CREATE TABLE data AS SELECT * FROM st_read('{tmp_path}')")
            except Exception:
                # Fall back to the xlsx reader if available
                conn.execute(f"CREATE TABLE data AS SELECT * FROM read_xlsx('{tmp_path}')")
        else:
            # Default to CSV
            conn.execute(f"CREATE TABLE data AS SELECT * FROM read_csv_auto('{tmp_path}')")
    finally:
        os.unlink(tmp_path)

    return conn, "data"
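# Note: callers own the connection returned above and must close it themselves.
# Illustrative sketch only (the `upload` variable is hypothetical):
#
#     conn, table = await read_to_duckdb(upload)
#     try:
#         row_count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
#     finally:
#         conn.close()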


@router.post("/analyze")
async def analyze_data(file: UploadFile = File(...)):
    """Analyze a dataset for integrity issues using DuckDB."""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        # Get basic stats using DuckDB
        total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]

        # Get column info
        columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
        column_names = [col[0] for col in columns_info]
        column_types = {col[0]: col[1] for col in columns_info}
        total_columns = len(column_names)

        # Missing values analysis using DuckDB SQL
        missing_values = {}
        for col in column_names:
            missing_count = conn.execute(f'''
                SELECT COUNT(*) - COUNT("{col}") as missing FROM {table_name}
            ''').fetchone()[0]
            if missing_count > 0:
                missing_values[col] = {
                    "count": int(missing_count),
                    "percent": round(missing_count / total_rows * 100, 2)
                }

        # Duplicate rows using DuckDB
        duplicate_query = f'''
            SELECT COUNT(*) as dup_count FROM (
                SELECT *, COUNT(*) OVER (PARTITION BY {', '.join([f'"{c}"' for c in column_names])}) as cnt
                FROM {table_name}
            ) WHERE cnt > 1
        '''
        try:
            duplicate_rows = conn.execute(duplicate_query).fetchone()[0]
        except Exception:
            # Fallback for complex cases
            duplicate_rows = 0
        duplicate_percent = round(duplicate_rows / total_rows * 100, 2) if total_rows > 0 else 0

        # Column statistics using DuckDB
        column_stats = []
        for col in column_names:
            col_type = column_types[col]

            # Get missing count
            missing_count = conn.execute(f'''
                SELECT COUNT(*) - COUNT("{col}") FROM {table_name}
            ''').fetchone()[0]
            missing_percent = round(missing_count / total_rows * 100, 2) if total_rows > 0 else 0

            # Get unique count
            unique_count = conn.execute(f'''
                SELECT COUNT(DISTINCT "{col}") FROM {table_name}
            ''').fetchone()[0]

            # Get sample values
            samples = conn.execute(f'''
                SELECT DISTINCT "{col}" FROM {table_name}
                WHERE "{col}" IS NOT NULL
                LIMIT 5
            ''').fetchall()
            sample_values = [str(s[0]) for s in samples]

            # Get min/max/mean/std for numeric columns
            min_val, max_val, mean_val, std_val = None, None, None, None
            if any(t in col_type.upper() for t in ('INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT')):
                stats = conn.execute(f'''
                    SELECT
                        MIN("{col}"),
                        MAX("{col}"),
                        AVG("{col}"),
                        STDDEV("{col}")
                    FROM {table_name}
                ''').fetchone()
                min_val = str(stats[0]) if stats[0] is not None else None
                max_val = str(stats[1]) if stats[1] is not None else None
                mean_val = round(float(stats[2]), 4) if stats[2] is not None else None
                std_val = round(float(stats[3]), 4) if stats[3] is not None else None

            column_stats.append(ColumnStats(
                name=col,
                dtype=col_type,
                missing_count=int(missing_count),
                missing_percent=missing_percent,
                unique_count=int(unique_count),
                sample_values=sample_values,
                min_value=min_val,
                max_value=max_val,
                mean_value=mean_val,
                std_value=std_val
            ))

        # Generate issues and recommendations
        issues = []
        recommendations = []

        # Check for missing values
        total_missing = sum(mv["count"] for mv in missing_values.values())
        if total_missing > 0:
            issues.append(f"Dataset has {total_missing:,} missing values across {len(missing_values)} columns")
            recommendations.append("Consider filling missing values with the mean/median for numeric columns or the mode for categorical columns")

        # Check for duplicates
        if duplicate_rows > 0:
            issues.append(f"Found {duplicate_rows:,} duplicate rows ({duplicate_percent}%)")
            recommendations.append("Consider removing duplicate rows to improve data quality")

        # Check for high-cardinality columns
        for col in column_names:
            unique_count = conn.execute(f'SELECT COUNT(DISTINCT "{col}") FROM {table_name}').fetchone()[0]
            unique_ratio = unique_count / total_rows if total_rows > 0 else 0
            col_type = column_types[col]
            if unique_ratio > 0.9 and 'VARCHAR' in col_type.upper():
                issues.append(f"Column '{col}' has very high cardinality ({unique_count:,} unique values)")
                recommendations.append(f"Review whether '{col}' should be used as an identifier rather than a feature")

        # Check for constant columns
        for col in column_names:
            unique_count = conn.execute(f'SELECT COUNT(DISTINCT "{col}") FROM {table_name}').fetchone()[0]
            if unique_count == 1:
                issues.append(f"Column '{col}' has only one unique value")
                recommendations.append(f"Consider removing constant column '{col}'")

        # Check for outliers in numeric columns using DuckDB
        outlier_columns = []
        total_outlier_count = 0
        for col in column_names:
            col_type = column_types[col]
            if any(t in col_type.upper() for t in ('INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT')):
                # Calculate the IQR using DuckDB
                quartiles = conn.execute(f'''
                    SELECT
                        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{col}") as q1,
                        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{col}") as q3
                    FROM {table_name}
                    WHERE "{col}" IS NOT NULL
                ''').fetchone()

                if quartiles[0] is not None and quartiles[1] is not None:
                    q1, q3 = float(quartiles[0]), float(quartiles[1])
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr

                    outlier_count = conn.execute(f'''
                        SELECT COUNT(*) FROM {table_name}
                        WHERE "{col}" < {lower_bound} OR "{col}" > {upper_bound}
                    ''').fetchone()[0]

                    if outlier_count > 0:
                        outlier_pct = round(outlier_count / total_rows * 100, 1)
                        issues.append(f"Column '{col}' has {outlier_count:,} potential outliers ({outlier_pct}%)")
                        outlier_columns.append(col)
                        total_outlier_count += outlier_count

        # Add outlier recommendations
        if outlier_columns:
            if total_outlier_count > total_rows * 0.1:
                recommendations.append(f"High outlier rate detected. Review the data collection process for columns: {', '.join(outlier_columns[:5])}")
            recommendations.append("Consider using robust scalers (RobustScaler) or winsorization for outlier-heavy columns")
            if len(outlier_columns) > 3:
                recommendations.append(f"Multiple columns ({len(outlier_columns)}) have outliers - consider domain-specific thresholds instead of IQR")

        if not issues:
            issues.append("No major data quality issues detected")
            recommendations.append("Dataset appears to be clean")

        return {
            "total_rows": total_rows,
            "total_columns": total_columns,
            "missing_values": missing_values,
            "duplicate_rows": int(duplicate_rows),
            "duplicate_percent": duplicate_percent,
            "column_stats": [cs.model_dump() for cs in column_stats],
            "issues": issues,
            "recommendations": recommendations,
            "engine": "DuckDB"  # Indicate we're using DuckDB
        }

    finally:
        conn.close()
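# Example request (illustrative only; the host, port, and "/audit" prefix are
# assumptions about how the router is mounted, and sales.csv is a placeholder):
#
#     curl -X POST "http://localhost:8000/audit/analyze" -F "file=@sales.csv"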


@router.post("/analyze-duckdb")
async def analyze_with_sql(file: UploadFile = File(...), query: Optional[str] = None):
    """Run custom SQL analysis on uploaded data using DuckDB"""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        if query:
            # Run custom query (replace 'data' with actual table name)
            safe_query = query.replace("FROM data", f"FROM {table_name}").replace("from data", f"FROM {table_name}")
            # Get column names from description
            desc = conn.execute(f"DESCRIBE ({safe_query})").fetchall()
            columns = [col[0] for col in desc]
            # Fetch data as list of tuples
            rows = conn.execute(safe_query).fetchall()
            # Convert to list of dicts
            data = [dict(zip(columns, row)) for row in rows]
            return {
                "columns": columns,
                "data": data,
                "row_count": len(rows)
            }
        else:
            # Return summary using DuckDB SUMMARIZE
            desc = conn.execute(f"DESCRIBE (SUMMARIZE {table_name})").fetchall()
            columns = [col[0] for col in desc]
            rows = conn.execute(f"SUMMARIZE {table_name}").fetchall()
            data = [dict(zip(columns, row)) for row in rows]
            return {
                "columns": columns,
                "data": data,
                "row_count": len(rows)
            }
    finally:
        conn.close()
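# Example request with a custom query (illustrative only; host, port, and the
# "/audit" prefix are assumptions about how the router is mounted; the query
# string must be URL-encoded):
#
#     curl -X POST \
#       "http://localhost:8000/audit/analyze-duckdb?query=SELECT%20*%20FROM%20data%20LIMIT%205" \
#       -F "file=@sales.csv"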


@router.post("/clean")
async def clean_data(file: UploadFile = File(...)):
    """Clean a dataset using DuckDB"""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        original_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        changes = []

        # Get column names
        columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()
        column_names = [col[0] for col in columns_info]

        # Remove duplicates using DuckDB
        conn.execute(f'''
            CREATE TABLE cleaned AS
            SELECT DISTINCT * FROM {table_name}
        ''')

        rows_after_dedup = conn.execute("SELECT COUNT(*) FROM cleaned").fetchone()[0]
        duplicates_removed = original_rows - rows_after_dedup
        if duplicates_removed > 0:
            changes.append(f"Removed {duplicates_removed:,} duplicate rows")

        # Count rows with any NULL values
        null_conditions = " OR ".join([f'"{col}" IS NULL' for col in column_names])
        rows_with_nulls = conn.execute(f'''
            SELECT COUNT(*) FROM cleaned WHERE {null_conditions}
        ''').fetchone()[0]

        # Remove rows with NULL values
        not_null_conditions = " AND ".join([f'"{col}" IS NOT NULL' for col in column_names])
        conn.execute(f'''
            CREATE TABLE final_cleaned AS
            SELECT * FROM cleaned WHERE {not_null_conditions}
        ''')

        cleaned_rows = conn.execute("SELECT COUNT(*) FROM final_cleaned").fetchone()[0]
        rows_dropped = rows_after_dedup - cleaned_rows
        if rows_dropped > 0:
            changes.append(f"Dropped {rows_dropped:,} rows with missing values")

        return {
            "message": "Data cleaned successfully",
            "original_rows": original_rows,
            "cleaned_rows": cleaned_rows,
            "rows_removed": original_rows - cleaned_rows,
            "changes": changes,
            "engine": "DuckDB"
        }
    finally:
        conn.close()


@router.post("/validate-schema")
async def validate_schema(file: UploadFile = File(...)):
    """Validate dataset schema using DuckDB"""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        row_count = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()

        schema = []
        for col in columns_info:
            col_name = col[0]
            col_type = col[1]

            # Check if nullable
            null_count = conn.execute(f'''
                SELECT COUNT(*) - COUNT("{col_name}") FROM {table_name}
            ''').fetchone()[0]

            # Get unique count
            unique_count = conn.execute(f'''
                SELECT COUNT(DISTINCT "{col_name}") FROM {table_name}
            ''').fetchone()[0]

            schema.append({
                "column": col_name,
                "dtype": col_type,
                "nullable": null_count > 0,
                "null_count": int(null_count),
                "unique_values": int(unique_count)
            })

        return {
            "valid": True,
            "row_count": row_count,
            "column_count": len(columns_info),
            "schema": schema,
            "engine": "DuckDB"
        }
    finally:
        conn.close()


@router.post("/detect-outliers")
async def detect_outliers(file: UploadFile = File(...)):
    """Detect outliers using DuckDB"""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        columns_info = conn.execute(f"DESCRIBE {table_name}").fetchall()

        numeric_cols = []
        outliers_by_column = {}
        total_outliers = 0

        for col in columns_info:
            col_name = col[0]
            col_type = col[1].upper()

            # Check if the column is numeric
            if any(t in col_type for t in ['INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'BIGINT', 'REAL']):
                numeric_cols.append(col_name)

                # Calculate the IQR
                quartiles = conn.execute(f'''
                    SELECT
                        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{col_name}") as q1,
                        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{col_name}") as q3,
                        MIN("{col_name}") as min_val,
                        MAX("{col_name}") as max_val
                    FROM {table_name}
                    WHERE "{col_name}" IS NOT NULL
                ''').fetchone()

                if quartiles[0] is not None and quartiles[1] is not None:
                    # Standard 1.5 * IQR fences, e.g. q1=10, q3=20 -> iqr=10,
                    # so values below -5 or above 35 are flagged
                    q1, q3 = float(quartiles[0]), float(quartiles[1])
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr

                    outlier_count = conn.execute(f'''
                        SELECT COUNT(*) FROM {table_name}
                        WHERE "{col_name}" IS NOT NULL
                        AND ("{col_name}" < {lower_bound} OR "{col_name}" > {upper_bound})
                    ''').fetchone()[0]

                    if outlier_count > 0:
                        outliers_by_column[col_name] = {
                            "count": int(outlier_count),
                            "percent": round(outlier_count / total_rows * 100, 2),
                            "lower_bound": round(lower_bound, 2),
                            "upper_bound": round(upper_bound, 2),
                            "q1": round(q1, 2),
                            "q3": round(q3, 2),
                            "iqr": round(iqr, 2),
                            "min_value": round(float(quartiles[2]), 2) if quartiles[2] is not None else None,
                            "max_value": round(float(quartiles[3]), 2) if quartiles[3] is not None else None
                        }
                        total_outliers += outlier_count

        return {
            "numeric_columns": numeric_cols,
            "outliers_by_column": outliers_by_column,
            "total_outliers": int(total_outliers),
            "total_rows": total_rows,
            "engine": "DuckDB"
        }
    finally:
        conn.close()


@router.post("/profile")
async def profile_data(file: UploadFile = File(...)):
    """Generate a comprehensive data profile using DuckDB SUMMARIZE"""
    try:
        conn, table_name = await read_to_duckdb(file)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not read file: {str(e)}")

    try:
        # Use DuckDB's built-in SUMMARIZE - get columns and data without pandas
        desc = conn.execute(f"DESCRIBE (SUMMARIZE {table_name})").fetchall()
        columns = [col[0] for col in desc]
        rows = conn.execute(f"SUMMARIZE {table_name}").fetchall()

        # Get row count
        total_rows = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]

        # Convert to a list of dicts
        profile = [dict(zip(columns, row)) for row in rows]

        return {
            "total_rows": total_rows,
            "total_columns": len(profile),
            "profile": profile,
            "engine": "DuckDB"
        }
    finally:
        conn.close()