386 lines
12 KiB
Python
386 lines
12 KiB
Python
"""
|
|
House Price Predictor API
|
|
Seattle/King County house price prediction and visualization
|
|
Using DuckDB for data operations
|
|
"""
|
|
from fastapi import APIRouter, Query, HTTPException
|
|
from pydantic import BaseModel
|
|
from typing import Optional
|
|
import duckdb
|
|
import pandas as pd
|
|
import numpy as np
|
|
import joblib
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Router that every endpoint in this module registers itself on.
router = APIRouter()

# Paths
# Both artefacts live in the "data" directory one level above this package.
DATA_PATH = Path(__file__).parent.parent.joinpath("data", "kc_house_data.csv")
MODEL_PATH = Path(__file__).parent.parent.joinpath("data", "house_price_model.joblib")

# DuckDB connection and model cache
# Both are created lazily by get_conn() / get_model() and then reused.
_conn: Optional[duckdb.DuckDBPyConnection] = None
_model = None
# Captured once at import time; used to derive the "age" column from yr_built.
_current_year = datetime.now().year
|
|
|
|
|
|
def get_conn() -> duckdb.DuckDBPyConnection:
    """Get or create the shared in-memory DuckDB connection with house data.

    On first use the CSV at ``DATA_PATH`` is loaded into a ``houses`` table
    that carries two derived columns: ``age`` (``_current_year - yr_built``)
    and ``sqft`` (a copy of ``sqft_living``).

    Returns:
        The cached connection; the same object is returned on every call.

    Raises:
        Exception: whatever DuckDB raises if the CSV cannot be loaded. In that
            case nothing is cached, so the next call retries the load.
    """
    global _conn
    if _conn is not None:
        return _conn

    conn = duckdb.connect(':memory:')
    # Double single quotes so a path containing "'" cannot terminate the SQL
    # string literal early.
    csv_path = str(DATA_PATH).replace("'", "''")
    try:
        # Load CSV and create table with calculated age column
        conn.execute(f"""
            CREATE TABLE houses AS
            SELECT
                *,
                {_current_year} - yr_built AS age,
                sqft_living AS sqft
            FROM read_csv_auto('{csv_path}')
        """)
    except Exception:
        # Do not cache a connection that has no `houses` table.
        conn.close()
        raise

    # Publish the connection only once the table exists.
    _conn = conn
    return _conn
|
|
|
|
|
|
def get_model():
    """Return the price-prediction model, loading it from MODEL_PATH on first use.

    The loaded object is kept in the module-level ``_model`` cache. All
    warnings emitted while joblib loads the file are suppressed.
    """
    global _model
    if _model is not None:
        return _model

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        _model = joblib.load(MODEL_PATH)
    return _model
|
|
|
|
|
|
# Request body for POST /predict. The four fields are passed to the model
# as the feature columns 'bedrooms', 'bathrooms', 'sqft' and 'age'.
# (Documented with comments rather than a class docstring so the generated
# schema description stays unchanged.)
class PredictionRequest(BaseModel):
    bedrooms: int
    bathrooms: float
    sqft: int
    age: int
|
|
|
|
|
|
# Response body for POST /predict.
# (Documented with comments rather than a class docstring so the generated
# schema description stays unchanged.)
class PredictionResponse(BaseModel):
    # Raw model output as a plain float.
    predicted_price: float
    # The same value rendered as "$1,234.56".
    formatted_price: str
|
|
|
|
|
|
@router.get("/metadata")
async def get_metadata():
    """Get metadata about the house dataset"""
    db = get_conn()

    # Price summary over the whole table.
    price_min, price_max, price_mean, price_median = db.execute("""
        SELECT
            MIN(price) as min_price,
            MAX(price) as max_price,
            AVG(price) as mean_price,
            MEDIAN(price) as median_price
        FROM houses
    """).fetchone()

    # Min/max of each feature exposed to the UI.
    (
        bed_min, bed_max,
        bath_min, bath_max,
        sqft_min, sqft_max,
        age_min, age_max,
    ) = db.execute("""
        SELECT
            MIN(bedrooms) as min_bed, MAX(bedrooms) as max_bed,
            MIN(bathrooms) as min_bath, MAX(bathrooms) as max_bath,
            MIN(sqft_living) as min_sqft, MAX(sqft_living) as max_sqft,
            MIN(age) as min_age, MAX(age) as max_age
        FROM houses
    """).fetchone()

    # Bounding box and mean position of all houses.
    lat_lo, lat_hi, long_lo, long_hi, lat_mid, long_mid = db.execute("""
        SELECT
            MIN(lat) as min_lat, MAX(lat) as max_lat,
            MIN(long) as min_long, MAX(long) as max_long,
            AVG(lat) as center_lat, AVG(long) as center_long
        FROM houses
    """).fetchone()

    # Sorted list of distinct zipcodes (one single-column row each).
    zip_rows = db.execute("SELECT DISTINCT zipcode FROM houses ORDER BY zipcode").fetchall()

    # Row count.
    (total,) = db.execute("SELECT COUNT(*) FROM houses").fetchone()

    price_range = {
        "min": float(price_min),
        "max": float(price_max),
        "mean": float(price_mean),
        "median": float(price_median)
    }
    features = {
        "bedrooms": {"min": int(bed_min), "max": int(bed_max)},
        "bathrooms": {"min": float(bath_min), "max": float(bath_max)},
        "sqft_living": {"min": int(sqft_min), "max": int(sqft_max)},
        "age": {"min": int(age_min), "max": int(age_max)}
    }
    location = {
        "lat_range": [float(lat_lo), float(lat_hi)],
        "long_range": [float(long_lo), float(long_hi)],
        "center": [float(lat_mid), float(long_mid)]
    }

    return {
        "total_records": total,
        "price_range": price_range,
        "features": features,
        "location": location,
        "zipcodes": [code for (code,) in zip_rows],
        "data_period": "2014-2015",
        "region": "King County, Washington"
    }
|
|
|
|
|
|
@router.get("/data")
async def get_house_data(
    min_price: Optional[float] = Query(None, description="Minimum price filter"),
    max_price: Optional[float] = Query(None, description="Maximum price filter"),
    min_bedrooms: Optional[int] = Query(None, description="Minimum bedrooms"),
    max_bedrooms: Optional[int] = Query(None, description="Maximum bedrooms"),
    waterfront: Optional[bool] = Query(None, description="Waterfront only"),
    zipcode: Optional[str] = Query(None, description="Filter by zipcode"),
    sample_size: Optional[int] = Query(1000, description="Number of records to return"),
    random_seed: Optional[int] = Query(42, description="Random seed for sampling")
):
    """Get house data with optional filters for map visualization

    Filters are applied first; at most `sample_size` of the matching rows are
    then returned (reservoir sampling, reproducible via `random_seed`).
    `total_filtered` is the number of rows matching the filters before sampling.
    """
    conn = get_conn()

    if sample_size is not None and sample_size < 0:
        raise HTTPException(status_code=400, detail="sample_size must be non-negative")

    # (SQL fragment, bound value) pairs. Values are always bound as query
    # parameters and never spliced into the SQL text, which closes the
    # injection hole on `zipcode` and keeps values like nan/inf from
    # producing malformed SQL.
    candidate_filters = [
        ("price >= ?", min_price),
        ("price <= ?", max_price),
        ("bedrooms >= ?", min_bedrooms),
        ("bedrooms <= ?", max_bedrooms),
        ("waterfront = ?", None if waterfront is None else int(waterfront)),
        # Compare as text so the filter works whether the CSV reader inferred
        # a numeric or a string zipcode column.
        ("CAST(zipcode AS VARCHAR) = ?", zipcode),
    ]
    active_filters = [(sql, value) for sql, value in candidate_filters if value is not None]
    params = [value for _, value in active_filters]
    where_clause = ""
    if active_filters:
        where_clause = "WHERE " + " AND ".join(sql for sql, _ in active_filters)

    # Both values are validated integers, so formatting them into the SQL is
    # safe. A missing sample size disables sampling; a missing seed leaves
    # the sample unseeded.
    sample_clause = ""
    if sample_size is not None:
        seed_part = "" if random_seed is None else f", {int(random_seed)}"
        sample_clause = f"USING SAMPLE {int(sample_size)} (reservoir{seed_part})"

    # DuckDB documents the sample clause as being applied right after the
    # FROM clause, i.e. before WHERE. Filtering inside a subquery guarantees
    # the sample is drawn from the rows that actually match the filters.
    query = f"""
        SELECT *
        FROM (
            SELECT
                id, price, bedrooms, bathrooms, sqft_living, sqft_lot,
                floors, waterfront, view, condition, grade, yr_built,
                age, lat, long, zipcode
            FROM houses
            {where_clause}
        ) AS filtered
        {sample_clause}
    """

    result = conn.execute(query, params).fetchdf()
    total_filtered = conn.execute(
        f"SELECT COUNT(*) FROM houses {where_clause}", params
    ).fetchone()[0]

    return {
        "total_filtered": int(total_filtered),
        "data": result.to_dict(orient='records')
    }
|
|
|
|
|
|
@router.get("/statistics")
async def get_statistics(
    group_by: Optional[str] = Query(None, description="Group by: bedrooms, zipcode, waterfront, grade"),
    min_price: Optional[float] = Query(None),
    max_price: Optional[float] = Query(None)
):
    """Get price statistics, optionally grouped

    With a recognised `group_by` column, one record per group is returned;
    otherwise overall statistics including percentiles are returned.
    Statistics that are undefined (e.g. the standard deviation of a group
    with a single house, or the mean when no house matches the price filter)
    are returned as null; the ungrouped `std` falls back to 0.
    """
    conn = get_conn()

    # Build WHERE clause; values are bound as parameters, not formatted into
    # the SQL text.
    conditions = []
    params = []
    if min_price is not None:
        conditions.append("price >= ?")
        params.append(min_price)
    if max_price is not None:
        conditions.append("price <= ?")
        params.append(max_price)
    where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""

    # `group_by` is only ever placed in the SQL after matching this whitelist.
    if group_by in ('bedrooms', 'zipcode', 'waterfront', 'grade'):
        query = f"""
            SELECT
                {group_by},
                COUNT(*) as count,
                AVG(price) as mean,
                MEDIAN(price) as median,
                STDDEV(price) as std,
                MIN(price) as min,
                MAX(price) as max
            FROM houses
            {where_clause}
            GROUP BY {group_by}
            ORDER BY mean DESC
        """
        result = conn.execute(query, params).fetchdf()
        # SQL NULLs arrive as NaN in the DataFrame (e.g. STDDEV of a one-row
        # group); NaN cannot be serialised to JSON, so map it to None.
        records = [
            {key: (None if pd.isna(value) else value) for key, value in record.items()}
            for record in result.to_dict(orient='records')
        ]
        return {
            "grouped_by": group_by,
            "statistics": records
        }

    query = f"""
        SELECT
            COUNT(*) as count,
            AVG(price) as mean,
            MEDIAN(price) as median,
            STDDEV(price) as std,
            MIN(price) as min,
            MAX(price) as max,
            PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY price) as p25,
            PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY price) as p50,
            PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY price) as p75,
            PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY price) as p90,
            PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY price) as p95
        FROM houses
        {where_clause}
    """
    row = conn.execute(query, params).fetchone()

    # Every aggregate except COUNT is NULL when no row matches the filter;
    # keep those as None instead of crashing on float(None).
    (
        mean, median, std, lowest, highest,
        p25, p50, p75, p90, p95,
    ) = [None if value is None else float(value) for value in row[1:]]

    return {
        "count": int(row[0]),
        "mean": mean,
        "median": median,
        # Undefined std (fewer than two rows) is reported as 0.
        "std": std if std else 0,
        "min": lowest,
        "max": highest,
        "percentiles": {
            "25": p25,
            "50": p50,
            "75": p75,
            "90": p90,
            "95": p95
        }
    }
|
|
|
|
|
|
@router.post("/predict", response_model=PredictionResponse)
async def predict_price(request: PredictionRequest):
    """Predict house price based on features"""
    model = get_model()

    # Single-row frame whose column names are the feature names passed to
    # the model.
    X = pd.DataFrame([[
        request.bedrooms,
        request.bathrooms,
        request.sqft,
        request.age
    ]], columns=['bedrooms', 'bathrooms', 'sqft', 'age'])

    try:
        # Convert once to a plain float so both response fields are derived
        # from the same value.
        predicted_price = float(model.predict(X)[0])
    except Exception as e:
        # Chain the original error so the traceback shows the real cause.
        raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}") from e

    return PredictionResponse(
        predicted_price=predicted_price,
        formatted_price=f"${predicted_price:,.2f}"
    )
|
|
|
|
|
|
@router.get("/price-distribution")
async def get_price_distribution(bins: int = Query(20, ge=5, le=50)):
    """Get price distribution for histogram

    Returns `bins` equal-width bins spanning the minimum to the maximum
    price. The last bin includes its upper edge, so the most expensive
    house is counted.
    """
    conn = get_conn()

    # Get min/max for bin calculation
    min_price, max_price = conn.execute(
        "SELECT MIN(price), MAX(price) FROM houses"
    ).fetchone()

    counts = [0] * bins
    if min_price is None or max_price is None:
        # No priced rows: there is no range to build edges from.
        return {"counts": counts, "bin_edges": [], "bin_centers": []}

    min_price, max_price = float(min_price), float(max_price)
    bin_width = (max_price - min_price) / bins

    # Build histogram data
    bin_edges = [min_price + i * bin_width for i in range(bins + 1)]
    bin_centers = [(lo + hi) / 2 for lo, hi in zip(bin_edges, bin_edges[1:])]

    if bin_width == 0:
        # Every price is identical: avoid dividing by zero and put all
        # priced rows into the first bin.
        counts[0] = int(
            conn.execute("SELECT COUNT(price) FROM houses").fetchone()[0]
        )
    else:
        rows = conn.execute(f"""
            SELECT
                FLOOR((price - {min_price}) / {bin_width}) as bin_idx,
                COUNT(*) as count
            FROM houses
            WHERE price IS NOT NULL
            GROUP BY bin_idx
            ORDER BY bin_idx
        """).fetchall()
        for bin_idx, count in rows:
            # price == max_price yields index `bins`; clamp it into the last
            # bin (and guard against rounding below zero) rather than
            # dropping those rows. `+=` because clamping can merge two groups.
            idx = min(max(int(bin_idx), 0), bins - 1)
            counts[idx] += int(count)

    return {
        "counts": counts,
        "bin_edges": bin_edges,
        "bin_centers": bin_centers
    }
|
|
|
|
|
|
@router.get("/correlation")
async def get_correlation():
    """Get correlation matrix for numeric features"""
    import math

    conn = get_conn()

    numeric_cols = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
                    'floors', 'waterfront', 'view', 'condition', 'grade', 'age']
    size = len(numeric_cols)

    # The matrix is symmetric with a unit diagonal, so only the upper
    # triangle needs computing, and all of it fits in a single query instead
    # of one query per cell.
    pairs = [(i, j) for i in range(size) for j in range(i + 1, size)]
    select_list = ", ".join(
        f"CORR({numeric_cols[i]}, {numeric_cols[j]})" for i, j in pairs
    )
    values = conn.execute(f"SELECT {select_list} FROM houses").fetchone()

    correlations = [[1.0] * size for _ in range(size)]
    for (i, j), value in zip(pairs, values):
        # An undefined correlation (NULL or NaN) is reported as 0.0; NaN in
        # particular cannot be serialised to JSON.
        if value is None or math.isnan(value):
            coefficient = 0.0
        else:
            coefficient = float(value)
        correlations[i][j] = coefficient
        correlations[j][i] = coefficient

    return {
        "columns": numeric_cols,
        "correlation": correlations
    }
|
|
|
|
|
|
@router.get("/price-by-location")
async def get_price_by_location(
    grid_size: int = Query(20, ge=5, le=50, description="Grid size for heatmap")
):
    """Get average prices by location grid for heatmap

    The bounding box of all houses is split into `grid_size` x `grid_size`
    cells; each non-empty cell is returned once, with its centre coordinates,
    average price and house count.
    """
    conn = get_conn()

    # Get bounds
    lat_min, lat_max, long_min, long_max = conn.execute("""
        SELECT MIN(lat), MAX(lat), MIN(long), MAX(long) FROM houses
    """).fetchone()

    if lat_min is None or long_min is None:
        # No located rows: nothing to grid.
        return {"lat_range": [], "long_range": [], "data": []}

    lat_min, lat_max = float(lat_min), float(lat_max)
    long_min, long_max = float(long_min), float(long_max)
    lat_step = (lat_max - lat_min) / grid_size
    long_step = (long_max - long_min) / grid_size

    # A zero step means all houses share that coordinate; divide by 1 instead
    # so every row falls into bin 0 rather than dividing by zero.
    lat_divisor = lat_step or 1.0
    long_divisor = long_step or 1.0
    last_bin = grid_size - 1

    # Rows sitting exactly on the upper edge compute bin index `grid_size`.
    # Clamping inside SQL (before GROUP BY) merges them into the last cell;
    # clamping afterwards would emit two records for the same cell.
    query = f"""
        SELECT
            LEAST(FLOOR((lat - {lat_min}) / {lat_divisor}), {last_bin}) as lat_bin,
            LEAST(FLOOR((long - {long_min}) / {long_divisor}), {last_bin}) as long_bin,
            AVG(price) as avg_price,
            COUNT(*) as count
        FROM houses
        WHERE lat IS NOT NULL AND long IS NOT NULL
        GROUP BY lat_bin, long_bin
    """
    rows = conn.execute(query).fetchall()

    # Convert bin indices to the coordinates of each cell's centre
    data = [
        {
            'lat': lat_min + (int(lat_bin) + 0.5) * lat_step,
            'long': long_min + (int(long_bin) + 0.5) * long_step,
            'avg_price': float(avg_price),
            'count': int(count)
        }
        for lat_bin, long_bin, avg_price, count in rows
    ]

    return {
        "lat_range": [lat_min, lat_max],
        "long_range": [long_min, long_max],
        "data": data
    }
|