import pandas as pd

# Read the raw simulation table. low_memory=False turns off pandas' chunked
# parsing, so each column's dtype is inferred from the whole file at once.
data0 = pd.read_csv(
    "Universal_Design_Space_Building_Energy_Simulation_input_output.csv",
    low_memory=False,
)
# data0.info()


# Keep the raw frame untouched; every later step works on this copy.
data = data0.copy()
data.describe()  # summary statistics of the numeric columns


data.head()  # first five rows


data.columns  # full list of column names

Index(['ID', 'BuildingType', 'ClimateZone', 'TotalArea', 'TotalArea_Setting',
       'FloorArea', 'FloorArea_Setting', 'NumFloors', 'PlateDepth',
       'PlateDepth_Setting', 'PlateLength', 'FloorHeight',
       'FloorHeight_Setting', 'Height', 'WWR', 'WWR_surfaces', 'SolarDesign',
       'Standard', 'HVAC', 'HVAC_Setting', 'EnvelopeQuality_Setting',
       'Wall_R_Value', 'Roof_R_Value', 'Glass_and_Frame_U_Value', 'SHGC',
       'LPD_Adjustment', 'LPD_Adjustment_Setting',
       'Interior_Lights_Final_W_per_sf', 'Exterior_Lights_Final_1_W',
       'Exterior_Lights_Final_2_W', 'Setpoint_Setting', 'HeatingCoil',
       'COP_Efficiency_Heating', 'CoolingCoil', 'COP_Efficiency_Cooling',
       'EUI_kBTU_per_sf', 'Electricity_Facility_kBTU_per_sf',
       'NaturalGas_Facility_kBTU_per_sf', 'Cooling_Electricity_kBTU_per_sf',
       'Heating_Electricity_kBTU_per_sf', 'Heating_NaturalGas_kBTU_per_sf',
       'Heating_Total_kBTU_per_sf', 'WaterSystems_Electricity_kBTU_per_sf',
       'Lighting_Electricity_kBTU_per_sf', 'Equipment_Electricity_kBTU_per_sf',
       'Fans_Electricity_kBTU_per_sf', 'Pumps_Electricity_kBTU_per_sf',
       'HeatRejection_Electricity_kBTU_per_sf',
       'HeatRecovery_Electricity_kBTU_per_sf'],
      dtype='object')


# Check unique values
# col = [i for i in data.columns]
# for i in col:
#     print(i,"---> \n",data[i].unique(),"\n")


# Remove the listed columns from the working frame; everything downstream
# (dtype grouping, outlier filtering, modelling) only sees what is left.
data = data.drop(
    [
        "TotalArea_Setting",
        "FloorArea_Setting",
        "PlateDepth_Setting",
        "FloorHeight_Setting",
        "WWR_surfaces",
        "Standard",
        "HVAC",
        "WaterSystems_Electricity_kBTU_per_sf",
        "LPD_Adjustment",
        "Setpoint_Setting",
    ],
    axis="columns",
)


# Split the column names into three lists according to their dtype.
# Nothing is converted here; the lists are used later to pick which columns
# to plot, filter, normalise and encode.

df2 = data.copy()
# Map each dtype to the Index of column names that carry it ...
g = df2.columns.to_series().groupby(df2.dtypes).groups
# ... and re-key that mapping by the dtype's string name.
a = {k.name: v for k, v in g.items()}
obls = list(a['object'])    # object (string) columns
flols = list(a['float64'])  # float columns
intls = list(a['int64'])    # integer columns


import matplotlib.pyplot as plt
# Box plot of every numeric column in the working frame.
plt.figure(figsize=(15, 5))

data.boxplot()
plt.xticks(rotation=90)
plt.show()


# Box plot of the two exterior-lighting columns on their own figure.
plt.figure(figsize=(5, 8))
data[['Exterior_Lights_Final_1_W', 'Exterior_Lights_Final_2_W']].boxplot()
plt.xticks(rotation=45)
# Show the figure explicitly, like the plot above: without this the cell's
# last expression is the (locations, labels) tuple returned by plt.xticks,
# and the figure is not rendered when the code runs outside a notebook.
plt.show()

(array([1, 2]),
 [Text(1, 0, 'Exterior_Lights_Final_1_W'),
  Text(2, 0, 'Exterior_Lights_Final_2_W')])


import seaborn as sns
import matplotlib.pyplot as plt

# For float columns 5..9, draw a horizontal box plot (left) next to a
# histogram (right) of the same column.
for i in flols[5:10]:
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    ax[0].set_title(i)
    ax[1].set_title('Distribution plot')

    sns.boxplot(ax=ax[0], data=data[i], orient='h')
    sns.histplot(ax=ax[1], data=data[i])

    # plt.show must be *called*, and only after both axes have been drawn,
    # so that each figure is rendered complete before the next one starts.
    plt.show()


# import seaborn as sns
# import matplotlib.pyplot as plt

# for i in flols:
#     fig, ax = plt.subplots(1, 2,figsize=(15,5))

#     ax[0].set_title('Boxplot')
#     ax[1].set_title('Distribution plot')

#     sns.boxplot(ax=ax[0], data=data[i], orient='h')
#     sns.histplot(ax=ax[1], data=data[i])


# Frame used for outlier detection: the working data without its integer and
# object columns (DataFrame.drop returns a new frame, `data` is not modified).
dfiqr = data.drop(columns=[*intls, *obls])


# Per-column first and third quartiles of the float-only frame.
tmp = dfiqr.quantile([0.25, 0.75])
q1 = tmp.loc[0.25]
q3 = tmp.loc[0.75]


# Interquartile range and the fences at 1.5 * IQR beyond each quartile;
# values outside [lower, upper] are treated as outliers further down.
IQR = q3 - q1
upper = q3 + 1.5 * IQR
lower = q1 - 1.5 * IQR


# Index labels of every row with at least one value outside its column's
# [lower, upper] fence.
is_outlier = ((dfiqr > upper) | (dfiqr < lower)).any(axis=1)
result = dfiqr.loc[is_outlier].index
result

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            260738, 260739, 260740, 260741, 260742, 260743, 260744, 260745,
            260746, 260747],
           dtype='int64', length=158695)


# Re-attach the ID column (removed together with the other object columns)
# so the filtered rows can be joined back to the rest of the data later.
dfiqr["ID"] = data["ID"]
# Remove every row flagged as an outlier.
dfiqrnew = dfiqr.drop(labels=result, axis="index")


dfiqrnew.head()


# Min-max normalisation: rescale each float column of the outlier-free frame
# to [0, 1] using that column's own minimum and maximum. The unscaled frame
# (dfiqrnew) is left untouched because its min/max are needed later to map
# predictions back to the original units.
dfiqrnew2 = dfiqrnew.copy()
for column in flols:
    # Local names deliberately avoid `min` / `max`, which would shadow the
    # builtins for the rest of the session.
    col_min = dfiqrnew[column].min()
    col_max = dfiqrnew[column].max()
    col_range = col_max - col_min

    if col_range == 0:
        # Constant column: (x - min) / 0 is undefined. Map it to 0.0 instead
        # of dividing by zero (missing values stay missing).
        dfiqrnew2[column] = dfiqrnew[column] - col_min
    else:
        # Vectorised on the whole column instead of a per-element lambda.
        dfiqrnew2[column] = (dfiqrnew[column] - col_min) / col_range
dfiqrnew2.head()


# Non-float part of the working data (ID, object and integer columns).
data2 = data.drop(columns=flols)

# Inner join on ID: keeps only the rows that survived outlier removal and
# pairs their non-float columns with the normalised float columns.
dfn = data2.merge(dfiqrnew2, on="ID", how="inner")
dfn.head()


# Replace every object column by the integer codes of its categories
# (object -> category -> numeric code).
df2 = dfn.copy()

for name in obls:
    df2[name] = df2[name].astype("category").cat.codes
# df2.info()


# Convert to float
from sklearn.preprocessing import LabelEncoder

col = df2.columns
le = LabelEncoder()  # NOTE(review): created but not used by any visible code
# Cast the whole frame (category codes and integer columns included) to float.
df2 = df2.astype('float')
# df2.info()


import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(24, 15))


# Columns 1..26 of the encoded frame (column 0, the encoded ID, is skipped).
data2 = df2.iloc[:, 1:27].copy()
col = data2.columns
# Pairwise Pearson correlation between those columns.
corr_sp = data2.corr(method='pearson')
# Plot the magnitude of the correlation, ignoring its sign.
sns.heatmap(corr_sp.abs(), annot=True, cmap="flare")  # summer_r
plt.title('Correlation Heatmap', fontdict={'fontsize':24}, pad=12)
plt.show()


import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

dfnew = df2.copy()

# Features: columns 1..25 of the encoded frame (column 0 is the encoded ID).
X = dfnew.iloc[:, 1:26]
# Target: the normalised energy use intensity, selected by name so that the
# choice does not silently change if columns are added or removed.
y = dfnew['EUI_kBTU_per_sf']

# 70 % / 30 % train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=999
)

# The scikit-learn wrapper takes the frames directly, so no separate
# xgb.DMatrix of the full data set needs to be built.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)

xg_reg.fit(X_train, y_train)

# Predictions are on the normalised (0..1) scale of the target.
preds = xg_reg.predict(X_test)


# Undo the min-max scaling of the target so the error is reported in the
# original units. The extremes come from the unscaled, outlier-free frame,
# i.e. the same values the normalisation used.
max1 = dfiqrnew['EUI_kBTU_per_sf'].max()
min1 = dfiqrnew['EUI_kBTU_per_sf'].min()
eui_range = max1 - min1

y_test_new = y_test * eui_range + min1
preds_new = preds * eui_range + min1

print("Mean Absolute Error: ",mean_absolute_error(y_test_new, preds_new))

Mean Absolute Error:  9.72474943313263


# Side-by-side table of predicted and actual EUI (original units) for the
# test rows.
X_test2 = X_test.copy()
X_test2['PREDICT'] = preds_new
# The actual values must come from y_test_new, which shares X_test's index.
# `data` still carries the index of the raw file, whereas X_test is indexed
# by row position in the merged, outlier-free frame, so assigning
# data['EUI_kBTU_per_sf'] here would pair each prediction with the EUI of an
# unrelated building.
X_test2['REAL DATA'] = y_test_new
X_test2['ERROR'] = (X_test2['PREDICT'] - X_test2['REAL DATA']).abs()

X_test2.iloc[:5, -3:]


# First test observation: actual vs. predicted EUI in original units.
# Both values are read by position from the test split itself; looking the
# row up in `data` by a hard-coded number would tie the cell to one
# particular split and to the raw file's index, which differs from the index
# of the merged frame the split was taken from.
print("Real data: ",y_test_new.iloc[0])
print("Predict: ",preds_new[0])
print("Mean Absolute Error: ",mean_absolute_error(y_test_new, preds_new))

Real data:  68.2635568063008
Predict:  51.33594512939453
Mean Absolute Error:  9.72474943313263


# Actual test values as a frame; the old index becomes an 'index' column.
df3 = y_test_new.to_frame().reset_index()

# Positional-index frame with actual value, prediction and absolute error.
df3new = df3.drop(columns=["index"])
df3new['preds'] = preds_new
df3new['Error'] = (df3new['EUI_kBTU_per_sf'] - df3new['preds']).abs()


from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt


# 3D scatter of actual value (x), prediction (y) and absolute error (z);
# the points are coloured by the error.
x = df3new['EUI_kBTU_per_sf']
y = df3new['preds']
z = df3new['Error']


fig = plt.figure(figsize = (16, 9))
ax = plt.axes(projection ="3d")


# The on/off flag is passed positionally: its keyword was renamed from `b`
# to `visible` in Matplotlib, so spelling out `b=True` breaks on current
# releases, while the positional form works on old and new versions alike.
ax.grid(True, color ='grey',
        linestyle ='-.', linewidth = 0.3,
        alpha = 0.2)


my_cmap = plt.get_cmap('inferno_r')


sctt = ax.scatter3D(x, y, z,
                    alpha = 0.8,
                    c = z,
                    cmap = my_cmap,
                    marker ='+',s=3)

plt.title("3D Scatter Plot", fontdict={'fontsize':18}, pad=12)
ax.set_xlabel('X - axis\n(Real data)', fontweight ='bold')
ax.set_ylabel('Y - axis\n(Predict data)', fontweight ='bold')
ax.set_zlabel('Z - axis\n(Error)', fontweight ='bold')
fig.colorbar(sctt, ax = ax, shrink = 0.5, aspect = 5)

plt.show()


# Line plot of the first 100 test observations: actual value, prediction
# and absolute error (columns 0, 1 and 2 of the results frame).
plt.figure(figsize=(18, 8))
first_rows = df3new.iloc[:100]
a = first_rows.iloc[:, 0]
b = first_rows.iloc[:, 1]
c = first_rows.iloc[:, 2]
for series, legend_text in ((a, "Real data"), (b, "Predict data"), (c, "Error")):
    plt.plot(series, label=legend_text)
plt.title("Predict data, Real data", fontdict={'fontsize':18}, pad=12)

plt.legend(loc="upper right")
plt.show()


import matplotlib.pyplot as plt
# Scatter of prediction against actual value with a least-squares line.
plt.figure(figsize=(8, 8))


x = df3new['EUI_kBTU_per_sf']
# Cast the predictions to float64 before plotting and fitting.
y = df3new['preds'].astype("float64")
plt.scatter(x, y, s=1)

# Degree-1 polynomial fit: slope m and intercept b.
m, b = np.polyfit(x, y, deg=1)
plt.title("Scatter Plot: Predict data Vs Real data", fontdict={'fontsize':18}, pad=12)

fitted = m * x + b
plt.plot(x, fitted, c="red")
plt.show()

	TotalArea	FloorArea	NumFloors	PlateDepth	PlateLength	FloorHeight	Height	WWR	Wall_R_Value	Roof_R_Value	...	Heating_Electricity_kBTU_per_sf	Heating_NaturalGas_kBTU_per_sf	Heating_Total_kBTU_per_sf	WaterSystems_Electricity_kBTU_per_sf	Lighting_Electricity_kBTU_per_sf	Equipment_Electricity_kBTU_per_sf	Fans_Electricity_kBTU_per_sf	Pumps_Electricity_kBTU_per_sf	HeatRejection_Electricity_kBTU_per_sf	HeatRecovery_Electricity_kBTU_per_sf
count	2.607480e+05	260748.000000	260748.000000	260748.000000	260748.000000	260748.000000	260748.000000	260748.000000	260748.000000	260748.000000	...	260748.000000	260748.000000	260748.000000	260748.0	260748.000000	260748.000000	260748.000000	260748.000000	260748.000000	260748.000000
mean	1.994595e+05	25846.604507	8.866783	97.235906	209.707273	14.602774	129.436559	59.377107	22.190043	41.705422	...	0.537301	43.137857	43.675158	0.0	8.084788	38.817513	22.115252	2.509235	0.395789	4.833672
std	2.156952e+05	10350.688871	10.753368	25.367408	76.845830	1.970821	161.182146	17.593025	7.072464	14.076231	...	1.728176	97.983243	97.762052	0.0	4.609707	15.903090	32.499733	4.598613	1.391958	10.479290
min	3.994400e+04	14519.000000	1.000000	45.000000	129.000000	10.000000	13.000000	25.000000	8.330000	21.340000	...	0.000000	0.000000	0.000000	0.0	0.648153	11.218701	0.180277	0.000000	0.000000	0.000000
25%	1.016330e+05	14971.000000	4.000000	75.000000	140.000000	13.000000	52.000000	50.390625	16.330000	31.340000	...	0.000000	0.000000	0.232082	0.0	4.537323	24.806507	2.727917	0.016485	0.000000	0.000000
50%	1.497120e+05	24998.000000	6.000000	98.000000	197.000000	15.000000	85.000000	61.742424	20.330000	31.340000	...	0.000000	0.688570	1.768745	0.0	7.506889	41.608445	7.107739	1.186714	0.000000	0.084560
75%	2.018510e+05	39539.000000	10.000000	122.000000	238.000000	16.000000	135.000000	71.228070	30.000000	60.000000	...	0.071649	8.487569	9.333217	0.0	10.513514	52.364630	15.606273	2.963894	0.229866	1.419913
max	1.009256e+06	40434.000000	67.000000	153.000000	404.000000	18.000000	1072.000000	89.910314	30.000000	60.000000	...	39.978431	583.250051	583.250051	0.0	19.497817	68.665014	103.441310	34.478568	15.186499	33.107724

	ID	BuildingType	ClimateZone	TotalArea	TotalArea_Setting	FloorArea	FloorArea_Setting	NumFloors	PlateDepth	PlateDepth_Setting	...	Heating_Electricity_kBTU_per_sf	Heating_Total_kBTU_per_sf	Lighting_Electricity_kBTU_per_sf	Equipment_Electricity_kBTU_per_sf	Fans_Electricity_kBTU_per_sf	Pumps_Electricity_kBTU_per_sf	HeatRecovery_Electricity_kBTU_per_sf
0	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	low	14542	low	7	122	high	...	0.083671	0.083671	10.209886	44.423582	5.378116	4.748715	3.704229
1	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	low	14542	low	7	122	high	...	0.083671	0.083671	10.209886	44.423582	3.689256	4.675590	3.647120
2	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	low	14542	low	7	122	high	...	0.083856	0.083856	10.209886	44.423582	5.725152	4.350329	3.657521
3	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	low	14542	low	7	122	high	...	0.083856	0.083856	10.209886	44.423582	3.751486	4.224587	3.551803
4	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	low	14542	low	7	122	high	...	0.062975	0.062975	10.209886	44.423582	4.730901	4.131386	3.586181

	WWR	Wall_R_Value	Roof_R_Value	Glass_and_Frame_U_Value	SHGC	Interior_Lights_Final_W_per_sf	Exterior_Lights_Final_1_W	Exterior_Lights_Final_2_W	COP_Efficiency_Cooling	EUI_kBTU_per_sf	...	Heating_Electricity_kBTU_per_sf	Heating_Total_kBTU_per_sf	Lighting_Electricity_kBTU_per_sf	Equipment_Electricity_kBTU_per_sf	Fans_Electricity_kBTU_per_sf	Pumps_Electricity_kBTU_per_sf	HeatRecovery_Electricity_kBTU_per_sf	ID
11	25.000000	30.00	60.00	0.20	0.25	1.0	441.80	8552.0	3.552525	94.998433	...	0.050334	0.050334	10.209886	44.423582	3.296442	3.845008	3.337595	College_1A_100000_14286_120_13_25_20_25_BaseLi...
17	25.000000	30.00	60.00	0.20	0.25	1.0	441.80	8552.0	3.996591	89.394823	...	0.040897	0.040897	10.209886	44.423582	3.296442	3.845008	3.337595	College_1A_100000_14286_120_13_25_20_25_BaseLi...
23	25.000000	30.00	60.00	0.20	0.25	1.0	441.80	8552.0	4.440657	83.791214	...	0.031459	0.031459	10.209886	44.423582	3.296442	3.845008	3.337595	College_1A_100000_14286_120_13_25_20_25_BaseLi...
29	25.000000	8.33	21.34	0.66	0.25	0.4	176.72	3420.8	2.960438	94.982676	...	0.135453	0.135453	4.083955	44.423582	3.185673	3.685125	3.213624	College_1A_100000_14286_120_13_25_66_25_BestLi...
31	74.152542	8.33	21.34	0.66	0.25	0.4	176.72	3420.8	3.552525	93.761766	...	0.172216	0.172216	4.083955	44.423582	3.352672	4.427529	3.426734	College_1A_100000_14286_120_13_25_66_25_BestLi...

	WWR	Wall_R_Value	Roof_R_Value	Glass_and_Frame_U_Value	Interior_Lights_Final_W_per_sf	Exterior_Lights_Final_1_W	Exterior_Lights_Final_2_W	COP_Efficiency_Cooling	EUI_kBTU_per_sf	...	Heating_Electricity_kBTU_per_sf	Heating_Total_kBTU_per_sf	Lighting_Electricity_kBTU_per_sf	Equipment_Electricity_kBTU_per_sf	Fans_Electricity_kBTU_per_sf	Pumps_Electricity_kBTU_per_sf	HeatRecovery_Electricity_kBTU_per_sf	ID
11	0.000000	1.0	1.0	0.0	1.000000	0.034954	1.0	0.091653	0.582000	...	0.281023	0.002372	0.646387	0.578016	0.086309	0.723544	0.940548	College_1A_100000_14286_120_13_25_20_25_BaseLi...
17	0.000000	1.0	1.0	0.0	1.000000	0.034954	1.0	0.160393	0.537886	...	0.228331	0.001927	0.646387	0.578016	0.086309	0.723544	0.940548	College_1A_100000_14286_120_13_25_20_25_BaseLi...
23	0.000000	1.0	1.0	0.0	1.000000	0.034954	1.0	0.229133	0.493772	...	0.175639	0.001483	0.646387	0.578016	0.086309	0.723544	0.940548	College_1A_100000_14286_120_13_25_20_25_BaseLi...
29	0.000000	0.0	0.0	1.0	0.294118	0.010065	0.4	0.000000	0.581876	...	0.756249	0.006384	0.232265	0.578016	0.083109	0.693458	0.905612	College_1A_100000_14286_120_13_25_66_25_BestLi...
31	0.757238	0.0	0.0	1.0	0.294118	0.010065	0.4	0.091653	0.572265	...	0.961502	0.008116	0.232265	0.578016	0.087934	0.833161	0.965667	College_1A_100000_14286_120_13_25_66_25_BestLi...

	ID	BuildingType	ClimateZone	TotalArea	FloorArea	NumFloors	PlateDepth	PlateLength	FloorHeight	Height	...	Cooling_Electricity_kBTU_per_sf	Heating_Electricity_kBTU_per_sf	Heating_Total_kBTU_per_sf	Lighting_Electricity_kBTU_per_sf	Equipment_Electricity_kBTU_per_sf	Fans_Electricity_kBTU_per_sf	Pumps_Electricity_kBTU_per_sf	HeatRecovery_Electricity_kBTU_per_sf
0	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	14542	7	122	350	13	91	...	0.821321	0.281023	0.002372	0.646387	0.578016	0.086309	0.723544	0.940548
1	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	14542	7	122	350	13	91	...	0.667235	0.228331	0.001927	0.646387	0.578016	0.086309	0.723544	0.940548
2	College_1A_100000_14286_120_13_25_20_25_BaseLi...	College	1A	101793	14542	7	122	350	13	91	...	0.513149	0.175639	0.001483	0.646387	0.578016	0.086309	0.723544	0.940548
3	College_1A_100000_14286_120_13_25_66_25_BestLi...	College	1A	101793	14542	7	122	350	13	91	...	0.998144	0.756249	0.006384	0.232265	0.578016	0.083109	0.693458	0.905612
4	College_1A_100000_14286_120_13_25_66_25_BestLi...	College	1A	101793	14542	7	122	350	13	91	...	0.932584	0.961502	0.008116	0.232265	0.578016	0.087934	0.833161	0.965667

Energy Use Intensity¶

Table of Contents

Import data¶

Check values in each column¶

EDA and Data preparation.¶

Cut off the outliers¶

Plot some columns to explore outliers¶

IQR (Interquartile Range)¶

Min-Max Normalization¶

Heat map plot to show the data correlation¶

Predict using XGBoost¶

Show some of the predicted data¶

Mean Absolute Error¶

Show the first predicted data and MAE¶

3D Scatter Plot of Predicted Data vs. Real Data vs. Error¶

Line Plot of Predicted Data, Real Data, and Error for the First 100 Observations¶

	PREDICT	REAL DATA	ERROR
36048	51.335945	68.263557	16.927612
94705	89.275932	70.950452	18.325480
21615	76.678230	64.576512	12.101718
1480	80.930763	84.433132	3.502369
92790	87.536217	59.044387	28.491830