In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
In [2]:
from sklearn.datasets import fetch_openml

# Load the Boston housing data from OpenML (as_frame=True returns a DataFrame)
boston = fetch_openml(name="boston", version=1, as_frame=True)
df = boston.data.copy()  # copy so adding the target doesn't modify boston.data
df['Price'] = boston.target
df.head(10)
Out[2]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT Price
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2
5 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222.0 18.7 394.12 5.21 28.7
6 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311.0 15.2 395.60 12.43 22.9
7 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311.0 15.2 396.90 19.15 27.1
8 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311.0 15.2 386.63 29.93 16.5
9 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311.0 15.2 386.71 17.10 18.9
In [3]:
print(boston.DESCR)
**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.
Variables in order:
CRIM     per capita crime rate by town
ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS    proportion of non-retail business acres per town
CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX      nitric oxides concentration (parts per 10 million)
RM       average number of rooms per dwelling
AGE      proportion of owner-occupied units built prior to 1940
DIS      weighted distances to five Boston employment centres
RAD      index of accessibility to radial highways
TAX      full-value property-tax rate per $10,000
PTRATIO  pupil-teacher ratio by town
B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
LSTAT    % lower status of the population
MEDV     Median value of owner-occupied homes in $1000's


Information about the dataset
CLASSTYPE: numeric
CLASSINDEX: last

Downloaded from openml.org.
In [4]:
# %pip install scikit-learn
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CRIM     506 non-null    float64 
 1   ZN       506 non-null    float64 
 2   INDUS    506 non-null    float64 
 3   CHAS     506 non-null    category
 4   NOX      506 non-null    float64 
 5   RM       506 non-null    float64 
 6   AGE      506 non-null    float64 
 7   DIS      506 non-null    float64 
 8   RAD      506 non-null    category
 9   TAX      506 non-null    float64 
 10  PTRATIO  506 non-null    float64 
 11  B        506 non-null    float64 
 12  LSTAT    506 non-null    float64 
 13  Price    506 non-null    float64 
dtypes: category(2), float64(12)
memory usage: 49.0 KB
In [6]:
df.shape
Out[6]:
(506, 14)
In [7]:
feature_names = df.columns.tolist()
print(feature_names)
['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'Price']
In [8]:
print(df.dtypes)
CRIM        float64
ZN          float64
INDUS       float64
CHAS       category
NOX         float64
RM          float64
AGE         float64
DIS         float64
RAD        category
TAX         float64
PTRATIO     float64
B           float64
LSTAT       float64
Price       float64
dtype: object
In [9]:
df.isnull().sum()
Out[9]:
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
Price      0
dtype: int64
In [10]:
df.corr()
Out[10]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT Price
CRIM 1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 -0.385064 0.455621 -0.388305
ZN -0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 0.360445
INDUS 0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 -0.483725
CHAS -0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.175260
NOX 0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 -0.427321
RM -0.219247 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 0.695360
AGE 0.352734 -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 -0.376955
DIS -0.379670 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 0.249929
RAD 0.625505 -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 -0.381626
TAX 0.582764 -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 -0.468536
PTRATIO 0.289946 -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 -0.507787
B -0.385064 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 0.333461
LSTAT 0.455621 -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 -0.737663
Price -0.388305 0.360445 -0.483725 0.175260 -0.427321 0.695360 -0.376955 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.737663 1.000000
In [11]:
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap="Greys", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation")
plt.show()
In [12]:
df.duplicated().sum()
Out[12]:
0
In [13]:
plt.figure(figsize=(8, 6))
sns.histplot(df['Price'], bins=30, kde=True, color='blue')
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [14]:
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['RM'], y=df['Price'], color='blue')
plt.title('Price vs. Number of Rooms')
plt.xlabel('RM')
plt.ylabel('Price')
plt.show()
In [15]:
plt.figure(figsize=(8, 6))
sns.boxplot(x=df['CHAS'], y=df['Price'], palette='coolwarm')
plt.title('Prices Based on CHAS')
plt.xlabel('CHAS (1 = Near River, 0 = Not Near River)')
plt.ylabel('Price')
plt.show()
In [16]:
df.describe()
Out[16]:
CRIM ZN INDUS NOX RM AGE DIS TAX PTRATIO B LSTAT Price
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.554695 6.284634 68.574901 3.795043 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.115878 0.702617 28.148861 2.105710 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.385000 3.561000 2.900000 1.129600 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.449000 5.885500 45.025000 2.100175 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.538000 6.208500 77.500000 3.207450 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677083 12.500000 18.100000 0.624000 6.623500 94.075000 5.188425 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 0.871000 8.780000 100.000000 12.126500 711.000000 22.000000 396.900000 37.970000 50.000000
In [17]:
# Coerce the two category columns (CHAS, RAD) to plain numbers so they can
# be scaled and modeled; drop any rows the coercion fails to parse
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)
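As a quick sanity check (a sketch, not part of the original pipeline): if every value in CHAS and RAD parsed cleanly, all columns are now float64 and the dropna removed nothing.

# Sketch: confirm the coercion parsed every value and the row count is intact
print(df.dtypes.value_counts())  # should now be all float64
print(df.shape)                  # expect (506, 14) if no rows were dropped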
In [18]:
x = df.drop(columns=['Price'])
y = df['Price']
In [19]:
scaler = RobustScaler()
x_scaled = scaler.fit_transform(x)
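For intuition: RobustScaler centers each column on its median and divides by the interquartile range, so outliers such as CRIM's long tail don't dominate the scale. A minimal sketch reproducing it by hand, assuming sklearn's default quantile range of (25, 75); note sklearn substitutes 1 for a zero IQR, which matters for the near-constant CHAS column.

# Sketch: manual robust scaling, column by column
iqr = x.quantile(0.75) - x.quantile(0.25)
iqr = iqr.replace(0, 1)  # sklearn maps a zero IQR to 1 to avoid division by zero
manual = (x - x.median()) / iqr
print(np.allclose(manual.values, x_scaled))  # should print True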
In [20]:
# Drop any feature whose mean absolute correlation across all columns
# (including its own 1.0 self-correlation) exceeds 0.8
correlation_matrix = pd.DataFrame(x_scaled, columns=x.columns).corr().abs()
high_correlation_features = correlation_matrix.columns[correlation_matrix.mean() > 0.8]
x_filtered = pd.DataFrame(x_scaled, columns=x.columns).drop(columns=high_correlation_features)
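It is worth printing what the filter actually removed; judging from the correlation matrix above, the 0.8 threshold on the column means may well drop nothing on this data. A quick look:

# Sketch: inspect the outcome of the mean-|corr| filter
print(list(high_correlation_features))  # columns removed by the filter
print(x_filtered.shape)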
In [21]:
# Add squared terms and pairwise interactions on top of the linear features
poly = PolynomialFeatures(degree=2, include_bias=False)
x_poly = poly.fit_transform(x_filtered)
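With include_bias=False, a degree-2 expansion of n columns yields n linear terms, n squares, and n(n-1)/2 pairwise interactions; assuming the filter above dropped nothing, 13 inputs become 13 + 13 + 78 = 104 features. A sketch to confirm:

# Sketch: inspect the expanded feature space
print(x_poly.shape)
# get_feature_names_out is available in scikit-learn >= 1.0
print(poly.get_feature_names_out(x_filtered.columns)[:5])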
In [22]:
x_train, x_test, y_train, y_test = train_test_split(x_poly, y, test_size=0.2, random_state=42)
In [23]:
model = LinearRegression()
# 5-fold cross-validated R² on the training split, then fit on all of it
scores = cross_val_score(model, x_train, y_train, cv=5, scoring='r2')
model.fit(x_train, y_train)
Out[23]:
LinearRegression()
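The fold scores from cross_val_score are computed above but never displayed; a short sketch to surface them before trusting the single held-out split:

# Sketch: report the cross-validation results
print(f"CV R² per fold: {np.round(scores, 3)}")
print(f"CV R² mean ± std: {scores.mean():.3f} ± {scores.std():.3f}")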
In [24]:
y_pred = model.predict(x_test)
In [25]:
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2}")
R²: 0.8055829447972154
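mean_squared_error is imported at the top but never used; reporting RMSE alongside R² puts the error in the target's own units (thousands of dollars). A sketch:

# Sketch: RMSE in the same units as Price
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f} ($1000s)")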
In [26]:
predicted = y_pred
expected = y_test
for p, e in zip(predicted[::5], expected[::5]):  # spot-check every 5th prediction
    print(f'predicted: {p:.2f}, expected: {e:.2f}')
predicted: 27.02, expected: 23.60
predicted: 16.73, expected: 20.00
predicted: 18.04, expected: 21.50
predicted: 25.02, expected: 29.80
predicted: 24.70, expected: 25.20
predicted: 18.48, expected: 13.80
predicted: 20.77, expected: 23.10
predicted: 17.67, expected: 19.40
predicted: 16.83, expected: 19.70
predicted: 17.70, expected: 15.10
predicted: 22.38, expected: 18.90
predicted: 23.74, expected: 21.20
predicted: 23.35, expected: 25.00
predicted: 15.13, expected: 20.00
predicted: 21.63, expected: 20.00
predicted: 8.16, expected: 5.00
predicted: 17.71, expected: 20.90
predicted: 28.73, expected: 29.00
predicted: 26.02, expected: 23.20
predicted: 22.49, expected: 23.00
predicted: 23.91, expected: 22.50
In [27]:
d = pd.DataFrame({"Expected": y_test, "Predicted": y_pred})
In [28]:
figure = plt.figure(figsize=(9, 9))
axes = sns.scatterplot(data=d, x="Expected", y="Predicted", hue="Predicted",
                       palette="cool", legend=False)
# Square the axes and draw a 45-degree reference line: points on the dashed
# line are perfect predictions
start = min(expected.min(), predicted.min())
end = max(expected.max(), predicted.max())
axes.set_xlim(start, end)
axes.set_ylim(start, end)
line = plt.plot([start, end], [start, end], "r--")
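A complementary diagnostic (a sketch along the same lines): plot residuals against predictions; a structureless band around zero suggests the polynomial fit has captured most of the signal.

# Sketch: residuals vs. predicted values
residuals = y_test - y_pred
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_pred, y=residuals, color='blue')
plt.axhline(0, color='red', linestyle='--')  # zero-error reference line
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.title('Residuals vs. Predicted Values')
plt.show()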
In [ ]: