""" House Price Predictor API Seattle/King County house price prediction and visualization Using DuckDB for data operations """ from fastapi import APIRouter, Query, HTTPException from pydantic import BaseModel from typing import Optional import duckdb import pandas as pd import numpy as np import joblib from pathlib import Path from datetime import datetime router = APIRouter() # Paths DATA_PATH = Path(__file__).parent.parent / "data" / "kc_house_data.csv" MODEL_PATH = Path(__file__).parent.parent / "data" / "house_price_model.joblib" # DuckDB connection and model cache _conn: Optional[duckdb.DuckDBPyConnection] = None _model = None _current_year = datetime.now().year def get_conn() -> duckdb.DuckDBPyConnection: """Get or create DuckDB connection with house data""" global _conn if _conn is None: _conn = duckdb.connect(':memory:') # Load CSV and create table with calculated age column _conn.execute(f""" CREATE TABLE houses AS SELECT *, {_current_year} - yr_built AS age, sqft_living AS sqft FROM read_csv_auto('{DATA_PATH}') """) return _conn def get_model(): """Load and cache the prediction model""" global _model if _model is None: import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") _model = joblib.load(MODEL_PATH) return _model class PredictionRequest(BaseModel): bedrooms: int bathrooms: float sqft: int age: int class PredictionResponse(BaseModel): predicted_price: float formatted_price: str @router.get("/metadata") async def get_metadata(): """Get metadata about the house dataset""" conn = get_conn() # Get price stats price_stats = conn.execute(""" SELECT MIN(price) as min_price, MAX(price) as max_price, AVG(price) as mean_price, MEDIAN(price) as median_price FROM houses """).fetchone() # Get feature ranges feature_stats = conn.execute(""" SELECT MIN(bedrooms) as min_bed, MAX(bedrooms) as max_bed, MIN(bathrooms) as min_bath, MAX(bathrooms) as max_bath, MIN(sqft_living) as min_sqft, MAX(sqft_living) as max_sqft, MIN(age) as min_age, MAX(age) as max_age FROM houses """).fetchone() # Get location bounds location_stats = conn.execute(""" SELECT MIN(lat) as min_lat, MAX(lat) as max_lat, MIN(long) as min_long, MAX(long) as max_long, AVG(lat) as center_lat, AVG(long) as center_long FROM houses """).fetchone() # Get zipcodes zipcodes = conn.execute("SELECT DISTINCT zipcode FROM houses ORDER BY zipcode").fetchall() # Get total count total = conn.execute("SELECT COUNT(*) FROM houses").fetchone()[0] return { "total_records": total, "price_range": { "min": float(price_stats[0]), "max": float(price_stats[1]), "mean": float(price_stats[2]), "median": float(price_stats[3]) }, "features": { "bedrooms": {"min": int(feature_stats[0]), "max": int(feature_stats[1])}, "bathrooms": {"min": float(feature_stats[2]), "max": float(feature_stats[3])}, "sqft_living": {"min": int(feature_stats[4]), "max": int(feature_stats[5])}, "age": {"min": int(feature_stats[6]), "max": int(feature_stats[7])} }, "location": { "lat_range": [float(location_stats[0]), float(location_stats[1])], "long_range": [float(location_stats[2]), float(location_stats[3])], "center": [float(location_stats[4]), float(location_stats[5])] }, "zipcodes": [z[0] for z in zipcodes], "data_period": "2014-2015", "region": "King County, Washington" } @router.get("/data") async def get_house_data( min_price: Optional[float] = Query(None, description="Minimum price filter"), max_price: Optional[float] = Query(None, description="Maximum price filter"), min_bedrooms: Optional[int] = Query(None, description="Minimum bedrooms"), max_bedrooms: Optional[int] = Query(None, description="Maximum bedrooms"), waterfront: Optional[bool] = Query(None, description="Waterfront only"), zipcode: Optional[str] = Query(None, description="Filter by zipcode"), sample_size: Optional[int] = Query(1000, description="Number of records to return"), random_seed: Optional[int] = Query(42, description="Random seed for sampling") ): """Get house data with optional filters for map visualization""" conn = get_conn() # Build WHERE clause conditions = [] if min_price is not None: conditions.append(f"price >= {min_price}") if max_price is not None: conditions.append(f"price <= {max_price}") if min_bedrooms is not None: conditions.append(f"bedrooms >= {min_bedrooms}") if max_bedrooms is not None: conditions.append(f"bedrooms <= {max_bedrooms}") if waterfront is not None: conditions.append(f"waterfront = {1 if waterfront else 0}") if zipcode is not None: conditions.append(f"zipcode = '{zipcode}'") where_clause = "WHERE " + " AND ".join(conditions) if conditions else "" # Query with optional sampling query = f""" SELECT id, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, yr_built, age, lat, long, zipcode FROM houses {where_clause} USING SAMPLE {sample_size} (reservoir, {random_seed}) """ result = conn.execute(query).fetchdf() total_filtered = conn.execute(f"SELECT COUNT(*) FROM houses {where_clause}").fetchone()[0] return { "total_filtered": int(total_filtered), "data": result.to_dict(orient='records') } @router.get("/statistics") async def get_statistics( group_by: Optional[str] = Query(None, description="Group by: bedrooms, zipcode, waterfront, grade"), min_price: Optional[float] = Query(None), max_price: Optional[float] = Query(None) ): """Get price statistics, optionally grouped""" conn = get_conn() # Build WHERE clause conditions = [] if min_price is not None: conditions.append(f"price >= {min_price}") if max_price is not None: conditions.append(f"price <= {max_price}") where_clause = "WHERE " + " AND ".join(conditions) if conditions else "" if group_by and group_by in ['bedrooms', 'zipcode', 'waterfront', 'grade']: query = f""" SELECT {group_by}, COUNT(*) as count, AVG(price) as mean, MEDIAN(price) as median, STDDEV(price) as std, MIN(price) as min, MAX(price) as max FROM houses {where_clause} GROUP BY {group_by} ORDER BY mean DESC """ result = conn.execute(query).fetchdf() return { "grouped_by": group_by, "statistics": result.to_dict(orient='records') } else: query = f""" SELECT COUNT(*) as count, AVG(price) as mean, MEDIAN(price) as median, STDDEV(price) as std, MIN(price) as min, MAX(price) as max, PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY price) as p25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY price) as p50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY price) as p75, PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY price) as p90, PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY price) as p95 FROM houses {where_clause} """ result = conn.execute(query).fetchone() return { "count": int(result[0]), "mean": float(result[1]), "median": float(result[2]), "std": float(result[3]) if result[3] else 0, "min": float(result[4]), "max": float(result[5]), "percentiles": { "25": float(result[6]), "50": float(result[7]), "75": float(result[8]), "90": float(result[9]), "95": float(result[10]) } } @router.post("/predict", response_model=PredictionResponse) async def predict_price(request: PredictionRequest): """Predict house price based on features""" model = get_model() # Create input DataFrame for prediction X = pd.DataFrame([[ request.bedrooms, request.bathrooms, request.sqft, request.age ]], columns=['bedrooms', 'bathrooms', 'sqft', 'age']) try: predicted_price = model.predict(X)[0] return PredictionResponse( predicted_price=float(predicted_price), formatted_price=f"${predicted_price:,.2f}" ) except Exception as e: raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}") @router.get("/price-distribution") async def get_price_distribution(bins: int = Query(20, ge=5, le=50)): """Get price distribution for histogram""" conn = get_conn() # Get min/max for bin calculation bounds = conn.execute("SELECT MIN(price), MAX(price) FROM houses").fetchone() min_price, max_price = bounds[0], bounds[1] bin_width = (max_price - min_price) / bins query = f""" SELECT FLOOR((price - {min_price}) / {bin_width}) as bin_idx, COUNT(*) as count FROM houses GROUP BY bin_idx ORDER BY bin_idx """ result = conn.execute(query).fetchdf() # Build histogram data bin_edges = [min_price + i * bin_width for i in range(bins + 1)] bin_centers = [(bin_edges[i] + bin_edges[i+1]) / 2 for i in range(bins)] counts = [0] * bins for _, row in result.iterrows(): idx = int(row['bin_idx']) if 0 <= idx < bins: counts[idx] = int(row['count']) return { "counts": counts, "bin_edges": bin_edges, "bin_centers": bin_centers } @router.get("/correlation") async def get_correlation(): """Get correlation matrix for numeric features""" conn = get_conn() numeric_cols = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'age'] # DuckDB doesn't have a built-in CORR matrix, so compute pairwise correlations = [] for col1 in numeric_cols: row = [] for col2 in numeric_cols: if col1 == col2: row.append(1.0) else: corr = conn.execute(f"SELECT CORR({col1}, {col2}) FROM houses").fetchone()[0] row.append(float(corr) if corr else 0.0) correlations.append(row) return { "columns": numeric_cols, "correlation": correlations } @router.get("/price-by-location") async def get_price_by_location( grid_size: int = Query(20, ge=5, le=50, description="Grid size for heatmap") ): """Get average prices by location grid for heatmap""" conn = get_conn() # Get bounds bounds = conn.execute(""" SELECT MIN(lat), MAX(lat), MIN(long), MAX(long) FROM houses """).fetchone() lat_min, lat_max = bounds[0], bounds[1] long_min, long_max = bounds[2], bounds[3] lat_step = (lat_max - lat_min) / grid_size long_step = (long_max - long_min) / grid_size query = f""" SELECT FLOOR((lat - {lat_min}) / {lat_step}) as lat_bin, FLOOR((long - {long_min}) / {long_step}) as long_bin, AVG(price) as avg_price, COUNT(*) as count FROM houses GROUP BY lat_bin, long_bin """ result = conn.execute(query).fetchdf() # Convert bin indices to actual coordinates data = [] for _, row in result.iterrows(): lat_bin = int(row['lat_bin']) if row['lat_bin'] < grid_size else grid_size - 1 long_bin = int(row['long_bin']) if row['long_bin'] < grid_size else grid_size - 1 data.append({ 'lat': lat_min + (lat_bin + 0.5) * lat_step, 'long': long_min + (long_bin + 0.5) * long_step, 'avg_price': float(row['avg_price']), 'count': int(row['count']) }) return { "lat_range": [float(lat_min), float(lat_max)], "long_range": [float(long_min), float(long_max)], "data": data }