34 lines
868 B
Python
34 lines
868 B
Python
from sklearn.datasets import fetch_california_housing
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.model_selection import train_test_split
|
|
import joblib
|
|
import pandas as pd
|
|
|
|
# Train a small linear-regression price model on the California housing data,
# report its score on a held-out split, and persist the fitted model to disk.

# Load dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target  # in 100k USD

# Engineer features
# NOTE(review): the multipliers below are hard-coded heuristics that turn the
# dataset's AveRooms column into listing-style features; confirm they match
# what the consumer of the saved model supplies at prediction time.
SQUARE_FEET_PER_ROOM = 350
BATHROOMS_PER_ROOM = 0.2

df['square_feet'] = df['AveRooms'] * SQUARE_FEET_PER_ROOM
df['bedrooms'] = df['AveBedrms']

# Clean bathrooms: floor the estimate at one bathroom.
df['bathrooms'] = (df['AveRooms'] * BATHROOMS_PER_ROOM).clip(lower=1)

# NOTE(review): square_feet and bathrooms are both scaled copies of AveRooms
# (they differ only on rows where the clip applies), so the two columns are
# almost perfectly collinear and their individual coefficients should not be
# interpreted on their own.
X = df[['square_feet', 'bedrooms', 'bathrooms']]
y = df['target']

# Train/test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out split, which was previously created but never
# used. For a regressor, score() returns the coefficient of determination.
test_r2 = model.score(X_test, y_test)
print(f'Held-out R^2: {test_r2:.3f}')

# Save model
joblib.dump(model, 'price_predictor.pkl')
|