# File listing metadata (not part of the program): 34 lines, 868 B, Python

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
# Train a linear-regression price model on the California housing dataset,
# report its score on a held-out split, and persist it with joblib.

# Multipliers used to derive proxy features from the AveRooms column.
SQFT_PER_ROOM = 350
BATHROOMS_PER_ROOM = 0.2
# Columns the model is trained on; consumers of the saved model must supply
# the same features in the same order.
FEATURE_COLUMNS = ['square_feet', 'bedrooms', 'bathrooms']
MODEL_PATH = 'price_predictor.pkl'

# Load dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target  # in 100k USD

# Engineer features
# NOTE: square_feet and bathrooms are both scalar multiples of AveRooms, so
# they are perfectly collinear except on rows where the clip below applies.
df['square_feet'] = df['AveRooms'] * SQFT_PER_ROOM
df['bedrooms'] = df['AveBedrms']
# Derive bathrooms and floor the value at 1 in a single step.
df['bathrooms'] = (df['AveRooms'] * BATHROOMS_PER_ROOM).clip(lower=1)

X = df[FEATURE_COLUMNS]
y = df['target']

# Train/test split (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out split so the model is not saved unchecked.
# For a regressor, score() returns the coefficient of determination (R^2).
test_r2 = model.score(X_test, y_test)
print(f"Held-out R^2: {test_r2:.4f}")

# Save model
joblib.dump(model, MODEL_PATH)