ai-tools-suite/backend/routers/house_predictor.py
2025-12-27 15:33:06 +00:00

386 lines
12 KiB
Python

"""
House Price Predictor API
Seattle/King County house price prediction and visualization
Using DuckDB for data operations
"""
from fastapi import APIRouter, Query, HTTPException
from pydantic import BaseModel
from typing import Optional
import duckdb
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from datetime import datetime
# Router on which every endpoint in this module registers itself.
router = APIRouter()
# Locations of the CSV dataset and the serialized model; both live in the
# "data" directory one level above this file's directory.
DATA_PATH = Path(__file__).parent.parent / "data" / "kc_house_data.csv"
MODEL_PATH = Path(__file__).parent.parent / "data" / "house_price_model.joblib"
# Lazily populated module-level caches: get_conn() fills _conn and
# get_model() fills _model the first time each is called.
_conn: Optional[duckdb.DuckDBPyConnection] = None
_model = None
# Calendar year captured once at import time; get_conn() subtracts yr_built
# from it to derive the "age" column.
_current_year = datetime.now().year
def get_conn() -> duckdb.DuckDBPyConnection:
    """Return the shared in-memory DuckDB connection, creating it on first use.

    On the first call the CSV at ``DATA_PATH`` is loaded into a ``houses``
    table holding every CSV column plus two derived ones: ``age``
    (``_current_year - yr_built``) and ``sqft`` (a copy of ``sqft_living``).

    The module-level cache is assigned only after the table has been created,
    so a failed load (for example a missing CSV) is retried on the next call
    instead of leaving behind a cached connection that has no ``houses`` table.

    Returns:
        The cached DuckDB connection.

    Raises:
        Exception: whatever DuckDB raises when the CSV cannot be loaded; the
            half-initialised connection is closed before re-raising.
    """
    global _conn
    if _conn is not None:
        return _conn

    conn = duckdb.connect(':memory:')
    # Double any single quote so the path cannot terminate the SQL literal.
    csv_path = str(DATA_PATH).replace("'", "''")
    try:
        # Load CSV and create table with calculated age column
        conn.execute(f"""
            CREATE TABLE houses AS
            SELECT
                *,
                {_current_year} - yr_built AS age,
                sqft_living AS sqft
            FROM read_csv_auto('{csv_path}')
        """)
    except Exception:
        conn.close()
        raise

    _conn = conn
    return _conn
def get_model():
    """Return the prediction model, reading it from ``MODEL_PATH`` only once.

    The first call deserializes the model with joblib and stores it in the
    module-level ``_model`` cache; later calls return the cached object.
    Any warning raised while loading is suppressed.
    """
    global _model
    if _model is not None:
        return _model

    import warnings

    with warnings.catch_warnings():
        # Scope the "ignore" filter to the load call only.
        warnings.simplefilter("ignore")
        _model = joblib.load(MODEL_PATH)
    return _model
# Request body accepted by the /predict endpoint. The four fields are passed
# to the model as the feature columns 'bedrooms', 'bathrooms', 'sqft', 'age'.
class PredictionRequest(BaseModel):
    bedrooms: int
    # Float rather than int, so fractional bathroom counts are accepted.
    bathrooms: float
    # NOTE(review): presumably living area in square feet, matching the
    # sqft_living-derived "sqft" column built in get_conn() — confirm against
    # the trained model.
    sqft: int
    # NOTE(review): presumably years since construction, matching the "age"
    # column built in get_conn() — confirm against the trained model.
    age: int
# Response body returned by the /predict endpoint.
class PredictionResponse(BaseModel):
    # Raw model output converted to a Python float.
    predicted_price: float
    # The same value rendered as "$" + thousands separators + two decimals.
    formatted_price: str
@router.get("/metadata")
async def get_metadata():
    """Describe the house dataset.

    Runs five aggregate queries against the ``houses`` table and returns the
    record count, price summary (min/max/mean/median), min/max of bedrooms,
    bathrooms, sqft_living and age, the latitude/longitude bounds and center,
    and the sorted list of distinct zipcodes, together with two fixed
    descriptive fields (``data_period`` and ``region``).
    """
    db = get_conn()

    # Price summary
    price_min, price_max, price_mean, price_median = db.execute("""
        SELECT
        MIN(price) as min_price,
        MAX(price) as max_price,
        AVG(price) as mean_price,
        MEDIAN(price) as median_price
        FROM houses
    """).fetchone()

    # Min/max of each model feature
    (bed_lo, bed_hi,
     bath_lo, bath_hi,
     sqft_lo, sqft_hi,
     age_lo, age_hi) = db.execute("""
        SELECT
        MIN(bedrooms) as min_bed, MAX(bedrooms) as max_bed,
        MIN(bathrooms) as min_bath, MAX(bathrooms) as max_bath,
        MIN(sqft_living) as min_sqft, MAX(sqft_living) as max_sqft,
        MIN(age) as min_age, MAX(age) as max_age
        FROM houses
    """).fetchone()

    # Geographic bounding box and its mean point
    lat_lo, lat_hi, long_lo, long_hi, lat_mid, long_mid = db.execute("""
        SELECT
        MIN(lat) as min_lat, MAX(lat) as max_lat,
        MIN(long) as min_long, MAX(long) as max_long,
        AVG(lat) as center_lat, AVG(long) as center_long
        FROM houses
    """).fetchone()

    # Distinct zipcodes, ascending
    zipcode_rows = db.execute("SELECT DISTINCT zipcode FROM houses ORDER BY zipcode").fetchall()

    # Row count
    record_count = db.execute("SELECT COUNT(*) FROM houses").fetchone()[0]

    price_range = {
        "min": float(price_min),
        "max": float(price_max),
        "mean": float(price_mean),
        "median": float(price_median)
    }
    features = {
        "bedrooms": {"min": int(bed_lo), "max": int(bed_hi)},
        "bathrooms": {"min": float(bath_lo), "max": float(bath_hi)},
        "sqft_living": {"min": int(sqft_lo), "max": int(sqft_hi)},
        "age": {"min": int(age_lo), "max": int(age_hi)}
    }
    location = {
        "lat_range": [float(lat_lo), float(lat_hi)],
        "long_range": [float(long_lo), float(long_hi)],
        "center": [float(lat_mid), float(long_mid)]
    }
    return {
        "total_records": record_count,
        "price_range": price_range,
        "features": features,
        "location": location,
        "zipcodes": [row[0] for row in zipcode_rows],
        "data_period": "2014-2015",
        "region": "King County, Washington"
    }
@router.get("/data")
async def get_house_data(
    min_price: Optional[float] = Query(None, description="Minimum price filter"),
    max_price: Optional[float] = Query(None, description="Maximum price filter"),
    min_bedrooms: Optional[int] = Query(None, description="Minimum bedrooms"),
    max_bedrooms: Optional[int] = Query(None, description="Maximum bedrooms"),
    waterfront: Optional[bool] = Query(None, description="Waterfront only"),
    zipcode: Optional[str] = Query(None, description="Filter by zipcode"),
    sample_size: Optional[int] = Query(1000, description="Number of records to return"),
    random_seed: Optional[int] = Query(42, description="Random seed for sampling")
):
    """Get house data with optional filters for map visualization.

    All filter values are bound as query parameters instead of being
    formatted into the SQL text, so a crafted ``zipcode`` cannot inject SQL
    and non-finite floats cannot produce malformed statements.

    Sampling is applied to the *filtered* rows: the filter runs in a subquery
    and the reservoir sample is drawn from its result. Placing ``USING
    SAMPLE`` directly next to ``WHERE`` would sample the whole table first and
    filter afterwards, returning far fewer rows than requested for selective
    filters.

    Args:
        min_price / max_price: inclusive price bounds.
        min_bedrooms / max_bedrooms: inclusive bedroom-count bounds.
        waterfront: when given, keep only rows whose waterfront flag matches.
        zipcode: exact zipcode to match (compared as text).
        sample_size: maximum number of rows to return; ``None`` returns every
            matching row, ``0`` returns no rows (count only).
        random_seed: seed for the reservoir sample; ``None`` leaves the
            sample unseeded.

    Returns:
        ``{"total_filtered": <matching row count>, "data": [<row dict>, ...]}``.

    Raises:
        HTTPException: 400 when ``sample_size`` is negative.
    """
    if sample_size is not None and sample_size < 0:
        raise HTTPException(status_code=400, detail="sample_size must not be negative")

    conn = get_conn()

    # Build WHERE clause with positional placeholders; values go in `params`.
    conditions = []
    params = []
    if min_price is not None:
        conditions.append("price >= ?")
        params.append(min_price)
    if max_price is not None:
        conditions.append("price <= ?")
        params.append(max_price)
    if min_bedrooms is not None:
        conditions.append("bedrooms >= ?")
        params.append(min_bedrooms)
    if max_bedrooms is not None:
        conditions.append("bedrooms <= ?")
        params.append(max_bedrooms)
    if waterfront is not None:
        conditions.append("waterfront = ?")
        params.append(1 if waterfront else 0)
    if zipcode is not None:
        # Compare as text so the bound string works whether the CSV reader
        # typed the column as an integer or as a string.
        conditions.append("CAST(zipcode AS VARCHAR) = ?")
        params.append(zipcode.strip())
    where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""

    # sample_size / random_seed are ints (validated by the framework), so
    # formatting them into the statement is safe.
    if sample_size is None:
        sample_clause = ""
    elif random_seed is None:
        sample_clause = f"USING SAMPLE {int(sample_size)} (reservoir)"
    else:
        sample_clause = f"USING SAMPLE {int(sample_size)} (reservoir, {int(random_seed)})"

    total_filtered = conn.execute(
        f"SELECT COUNT(*) FROM houses {where_clause}", params
    ).fetchone()[0]

    if sample_size == 0:
        records = []
    else:
        # Filter inside the subquery, sample outside it.
        query = f"""
            SELECT *
            FROM (
                SELECT
                    id, price, bedrooms, bathrooms, sqft_living, sqft_lot,
                    floors, waterfront, view, condition, grade, yr_built,
                    age, lat, long, zipcode
                FROM houses
                {where_clause}
            ) AS filtered
            {sample_clause}
        """
        records = conn.execute(query, params).fetchdf().to_dict(orient='records')

    return {
        "total_filtered": int(total_filtered),
        "data": records
    }
def _float_or_none(value):
    """Convert a SQL aggregate result to float, passing NULL (None) through."""
    return None if value is None else float(value)


@router.get("/statistics")
async def get_statistics(
    group_by: Optional[str] = Query(None, description="Group by: bedrooms, zipcode, waterfront, grade"),
    min_price: Optional[float] = Query(None),
    max_price: Optional[float] = Query(None)
):
    """Get price statistics, optionally grouped.

    Args:
        group_by: one of ``bedrooms``, ``zipcode``, ``waterfront``, ``grade``.
            Any other value (or none) yields overall statistics instead.
        min_price / max_price: inclusive price bounds applied before
            aggregating; bound as query parameters rather than formatted into
            the SQL text.

    Returns:
        Grouped: ``{"grouped_by": <column>, "statistics": [<row dict>, ...]}``
        ordered by descending mean. A group's ``std`` is 0 when the standard
        deviation is undefined (single-row group), matching the ungrouped
        response and keeping NaN out of the JSON payload.

        Ungrouped: count, mean, median, std, min, max and the 25/50/75/90/95
        percentiles. When no row matches the filter, ``count`` is 0, ``std``
        is 0 and the remaining statistics are ``null``.
    """
    conn = get_conn()

    # Build WHERE clause with positional placeholders; values go in `params`.
    conditions = []
    params = []
    if min_price is not None:
        conditions.append("price >= ?")
        params.append(min_price)
    if max_price is not None:
        conditions.append("price <= ?")
        params.append(max_price)
    where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""

    if group_by and group_by in ['bedrooms', 'zipcode', 'waterfront', 'grade']:
        # group_by is formatted into the statement only after the whitelist
        # check above, so it can only ever be one of the four column names.
        query = f"""
            SELECT
                {group_by},
                COUNT(*) as count,
                AVG(price) as mean,
                MEDIAN(price) as median,
                COALESCE(STDDEV(price), 0) as std,
                MIN(price) as min,
                MAX(price) as max
            FROM houses
            {where_clause}
            GROUP BY {group_by}
            ORDER BY mean DESC
        """
        result = conn.execute(query, params).fetchdf()
        return {
            "grouped_by": group_by,
            "statistics": result.to_dict(orient='records')
        }

    query = f"""
        SELECT
            COUNT(*) as count,
            AVG(price) as mean,
            MEDIAN(price) as median,
            STDDEV(price) as std,
            MIN(price) as min,
            MAX(price) as max,
            PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY price) as p25,
            PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY price) as p50,
            PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY price) as p75,
            PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY price) as p90,
            PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY price) as p95
        FROM houses
        {where_clause}
    """
    result = conn.execute(query, params).fetchone()
    # Every aggregate except COUNT is NULL when the filter matches no rows.
    return {
        "count": int(result[0]),
        "mean": _float_or_none(result[1]),
        "median": _float_or_none(result[2]),
        "std": float(result[3]) if result[3] else 0,
        "min": _float_or_none(result[4]),
        "max": _float_or_none(result[5]),
        "percentiles": {
            "25": _float_or_none(result[6]),
            "50": _float_or_none(result[7]),
            "75": _float_or_none(result[8]),
            "90": _float_or_none(result[9]),
            "95": _float_or_none(result[10])
        }
    }
@router.post("/predict", response_model=PredictionResponse)
async def predict_price(request: PredictionRequest):
    """Predict a house price from bedrooms, bathrooms, sqft and age.

    The request fields are placed in a one-row DataFrame whose columns are
    named 'bedrooms', 'bathrooms', 'sqft', 'age' (in that order) and passed to
    the cached model's ``predict``.

    Raises:
        HTTPException: 500 with the underlying error text when prediction or
            response construction fails.
    """
    estimator = get_model()

    # One-row frame: column order follows feature_names.
    feature_names = ['bedrooms', 'bathrooms', 'sqft', 'age']
    feature_row = [getattr(request, name) for name in feature_names]
    X = pd.DataFrame([feature_row], columns=feature_names)

    try:
        predicted_price = estimator.predict(X)[0]
        response = PredictionResponse(
            predicted_price=float(predicted_price),
            formatted_price=f"${predicted_price:,.2f}"
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
    else:
        return response
@router.get("/price-distribution")
async def get_price_distribution(bins: int = Query(20, ge=5, le=50)):
    """Get price distribution for histogram.

    Prices are split into ``bins`` equal-width bins spanning
    [min price, max price]. The last bin is closed on the right, so the
    maximum price is counted in it; without that, ``FLOOR`` maps the maximum
    to index ``bins`` and the most expensive house(s) would be dropped.

    Args:
        bins: number of histogram bins (5-50).

    Returns:
        ``{"counts": [...], "bin_edges": [...], "bin_centers": [...]}`` with
        ``bins`` counts, ``bins + 1`` edges and ``bins`` centers. When the
        table holds no prices, the counts are all zero and both edge lists
        are empty.
    """
    conn = get_conn()

    # Get min/max for bin calculation
    min_price, max_price = conn.execute("SELECT MIN(price), MAX(price) FROM houses").fetchone()
    if min_price is None or max_price is None:
        return {"counts": [0] * bins, "bin_edges": [], "bin_centers": []}

    min_price, max_price = float(min_price), float(max_price)
    bin_width = (max_price - min_price) / bins
    # All prices identical -> zero width; divide by 1 so every row lands in bin 0.
    divisor = bin_width or 1.0

    rows = conn.execute(
        """
        SELECT
            FLOOR((price - ?) / ?) as bin_idx,
            COUNT(*) as count
        FROM houses
        WHERE price IS NOT NULL
        GROUP BY bin_idx
        ORDER BY bin_idx
        """,
        [min_price, divisor],
    ).fetchall()

    # Build histogram data
    counts = [0] * bins
    last_bin = bins - 1
    for bin_idx, count in rows:
        # Clamp so the maximum price (index == bins) falls into the last bin;
        # accumulate because clamping can map two SQL groups to one bin.
        idx = min(max(int(bin_idx), 0), last_bin)
        counts[idx] += int(count)

    bin_edges = [min_price + i * bin_width for i in range(bins + 1)]
    bin_centers = [(lo + hi) / 2 for lo, hi in zip(bin_edges, bin_edges[1:])]
    return {
        "counts": counts,
        "bin_edges": bin_edges,
        "bin_centers": bin_centers
    }
@router.get("/correlation")
async def get_correlation():
    """Get correlation matrix for numeric features.

    Every pairwise correlation is computed in a single query over the
    ``houses`` table: one ``CORR`` aggregate per unordered column pair. The
    results are mirrored across the diagonal, so the matrix is exactly
    symmetric. The diagonal is fixed at 1.0. A correlation that is undefined
    (NULL or NaN, e.g. for a constant column) is reported as 0.0 so the
    response stays JSON-serializable.

    Returns:
        ``{"columns": [<name>, ...], "correlation": [[<float>, ...], ...]}``
        where row/column order follows ``columns``.
    """
    import math
    from itertools import combinations

    conn = get_conn()
    numeric_cols = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
                    'floors', 'waterfront', 'view', 'condition', 'grade', 'age']
    size = len(numeric_cols)

    # Index pairs (i, j) with i < j: the strict upper triangle of the matrix.
    pairs = list(combinations(range(size), 2))
    # Column names come from the fixed list above, never from user input.
    select_list = ", ".join(
        f'CORR("{numeric_cols[i]}", "{numeric_cols[j]}")' for i, j in pairs
    )
    values = conn.execute(f"SELECT {select_list} FROM houses").fetchone()

    correlations = [[1.0 if i == j else 0.0 for j in range(size)] for i in range(size)]
    for (i, j), value in zip(pairs, values):
        if value is None or math.isnan(value):
            continue  # undefined correlation stays at 0.0
        correlations[i][j] = correlations[j][i] = float(value)

    return {
        "columns": numeric_cols,
        "correlation": correlations
    }
@router.get("/price-by-location")
async def get_price_by_location(
    grid_size: int = Query(20, ge=5, le=50, description="Grid size for heatmap")
):
    """Get average prices by location grid for heatmap.

    The bounding box of all houses is divided into ``grid_size`` x
    ``grid_size`` cells and the average price and row count are computed per
    non-empty cell. Rows on the upper latitude/longitude boundary are folded
    into the last cell *before* aggregating, so each cell appears at most
    once in the output (clamping after grouping would emit the edge cells
    twice, each with its own partial average).

    Args:
        grid_size: number of cells along each axis (5-50).

    Returns:
        ``{"lat_range": [min, max], "long_range": [min, max], "data": [...]}``
        where each data entry holds the cell-center ``lat``/``long``, the
        cell's ``avg_price`` and its ``count``.

    Raises:
        HTTPException: 404 when the table contains no coordinates.
    """
    conn = get_conn()

    # Get bounds
    lat_min, lat_max, long_min, long_max = conn.execute("""
        SELECT MIN(lat), MAX(lat), MIN(long), MAX(long) FROM houses
    """).fetchone()
    if None in (lat_min, lat_max, long_min, long_max):
        raise HTTPException(status_code=404, detail="No location data available")

    lat_step = (lat_max - lat_min) / grid_size
    long_step = (long_max - long_min) / grid_size
    last_bin = grid_size - 1

    # A zero step means every row shares one coordinate; dividing by 1 puts
    # them all in bin 0 instead of dividing by zero.
    rows = conn.execute(
        """
        SELECT
            LEAST(CAST(FLOOR((lat - ?) / ?) AS BIGINT), ?) as lat_bin,
            LEAST(CAST(FLOOR((long - ?) / ?) AS BIGINT), ?) as long_bin,
            AVG(price) as avg_price,
            COUNT(*) as count
        FROM houses
        WHERE lat IS NOT NULL AND long IS NOT NULL
        GROUP BY lat_bin, long_bin
        """,
        [lat_min, lat_step or 1.0, last_bin, long_min, long_step or 1.0, last_bin],
    ).fetchall()

    # Convert bin indices to the coordinates of each cell's center.
    data = [
        {
            'lat': lat_min + (int(lat_bin) + 0.5) * lat_step,
            'long': long_min + (int(long_bin) + 0.5) * long_step,
            'avg_price': float(avg_price),
            'count': int(count)
        }
        for lat_bin, long_bin, avg_price, count in rows
    ]
    return {
        "lat_range": [float(lat_min), float(lat_max)],
        "long_range": [float(long_min), float(long_max)],
        "data": data
    }