{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":5407,"databundleVersionId":868283,"sourceType":"competition"},{"sourceId":7643882,"sourceType":"datasetVersion","datasetId":4455375}],"dockerImageVersionId":30646,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## House Price Prediction Competition","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19"}},{"cell_type":"markdown","source":"Welcome to our notebook focused on preparing a dataset for the House Price Prediction Competition. Our goal is to predict house prices based on their properties, aiming for accurate predictions in this competition. 
Dataset provided via the following link:  \nhttps://www.kaggle.com/competitions/house-prices-advanced-regression-techniques\n","metadata":{}},{"cell_type":"markdown","source":"### Read Data","metadata":{}},{"cell_type":"code","source":"import pandas as pd\n\ndf = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')\n\ndf.info()\n\nprint('='*50)\n\ndf.set_index('Id', inplace=True)\ndf","metadata":{"execution":{"iopub.status.busy":"2024-02-27T17:03:06.430994Z","iopub.execute_input":"2024-02-27T17:03:06.431525Z","iopub.status.idle":"2024-02-27T17:03:06.818799Z","shell.execute_reply.started":"2024-02-27T17:03:06.431486Z","shell.execute_reply":"2024-02-27T17:03:06.817844Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 1460 entries, 0 to 1459\nData columns (total 81 columns):\n #   Column         Non-Null Count  Dtype  \n---  ------         --------------  -----  \n 0   Id             1460 non-null   int64  \n 1   MSSubClass     1460 non-null   int64  \n 2   MSZoning       1460 non-null   object \n 3   LotFrontage    1201 non-null   float64\n 4   LotArea        1460 non-null   int64  \n 5   Street         1460 non-null   object \n 6   Alley          91 non-null     object \n 7   LotShape       1460 non-null   object \n 8   LandContour    1460 non-null   object \n 9   Utilities      1460 non-null   object \n 10  LotConfig      1460 non-null   object \n 11  LandSlope      1460 non-null   object \n 12  Neighborhood   1460 non-null   object \n 13  Condition1     1460 non-null   object \n 14  Condition2     1460 non-null   object \n 15  BldgType       1460 non-null   object \n 16  HouseStyle     1460 non-null   object \n 17  OverallQual    1460 non-null   int64  \n 18  OverallCond    1460 non-null   int64  \n 19  YearBuilt      1460 non-null   int64  \n 20  YearRemodAdd   1460 non-null   int64  \n 21  RoofStyle      1460 non-null   object \n 22  RoofMatl       1460 
non-null   object \n 23  Exterior1st    1460 non-null   object \n 24  Exterior2nd    1460 non-null   object \n 25  MasVnrType     588 non-null    object \n 26  MasVnrArea     1452 non-null   float64\n 27  ExterQual      1460 non-null   object \n 28  ExterCond      1460 non-null   object \n 29  Foundation     1460 non-null   object \n 30  BsmtQual       1423 non-null   object \n 31  BsmtCond       1423 non-null   object \n 32  BsmtExposure   1422 non-null   object \n 33  BsmtFinType1   1423 non-null   object \n 34  BsmtFinSF1     1460 non-null   int64  \n 35  BsmtFinType2   1422 non-null   object \n 36  BsmtFinSF2     1460 non-null   int64  \n 37  BsmtUnfSF      1460 non-null   int64  \n 38  TotalBsmtSF    1460 non-null   int64  \n 39  Heating        1460 non-null   object \n 40  HeatingQC      1460 non-null   object \n 41  CentralAir     1460 non-null   object \n 42  Electrical     1459 non-null   object \n 43  1stFlrSF       1460 non-null   int64  \n 44  2ndFlrSF       1460 non-null   int64  \n 45  LowQualFinSF   1460 non-null   int64  \n 46  GrLivArea      1460 non-null   int64  \n 47  BsmtFullBath   1460 non-null   int64  \n 48  BsmtHalfBath   1460 non-null   int64  \n 49  FullBath       1460 non-null   int64  \n 50  HalfBath       1460 non-null   int64  \n 51  BedroomAbvGr   1460 non-null   int64  \n 52  KitchenAbvGr   1460 non-null   int64  \n 53  KitchenQual    1460 non-null   object \n 54  TotRmsAbvGrd   1460 non-null   int64  \n 55  Functional     1460 non-null   object \n 56  Fireplaces     1460 non-null   int64  \n 57  FireplaceQu    770 non-null    object \n 58  GarageType     1379 non-null   object \n 59  GarageYrBlt    1379 non-null   float64\n 60  GarageFinish   1379 non-null   object \n 61  GarageCars     1460 non-null   int64  \n 62  GarageArea     1460 non-null   int64  \n 63  GarageQual     1379 non-null   object \n 64  GarageCond     1379 non-null   object \n 65  PavedDrive     1460 non-null   object \n 66  WoodDeckSF     1460 non-null   int64  
\n 67  OpenPorchSF    1460 non-null   int64  \n 68  EnclosedPorch  1460 non-null   int64  \n 69  3SsnPorch      1460 non-null   int64  \n 70  ScreenPorch    1460 non-null   int64  \n 71  PoolArea       1460 non-null   int64  \n 72  PoolQC         7 non-null      object \n 73  Fence          281 non-null    object \n 74  MiscFeature    54 non-null     object \n 75  MiscVal        1460 non-null   int64  \n 76  MoSold         1460 non-null   int64  \n 77  YrSold         1460 non-null   int64  \n 78  SaleType       1460 non-null   object \n 79  SaleCondition  1460 non-null   object \n 80  SalePrice      1460 non-null   int64  \ndtypes: float64(3), int64(35), object(43)\nmemory usage: 924.0+ KB\n==================================================\n","output_type":"stream"},{"execution_count":2,"output_type":"execute_result","data":{"text/plain":"      MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\nId                                                                      \n1             60       RL         65.0     8450   Pave   NaN      Reg   \n2             20       RL         80.0     9600   Pave   NaN      Reg   \n3             60       RL         68.0    11250   Pave   NaN      IR1   \n4             70       RL         60.0     9550   Pave   NaN      IR1   \n5             60       RL         84.0    14260   Pave   NaN      IR1   \n...          ...      ...          ...      ...    ...   ...      ...   \n1456          60       RL         62.0     7917   Pave   NaN      Reg   \n1457          20       RL         85.0    13175   Pave   NaN      Reg   \n1458          70       RL         66.0     9042   Pave   NaN      Reg   \n1459          20       RL         68.0     9717   Pave   NaN      Reg   \n1460          20       RL         75.0     9937   Pave   NaN      Reg   \n\n     LandContour Utilities LotConfig  ... PoolArea PoolQC  Fence MiscFeature  \\\nId                                    ...                                      
\n1            Lvl    AllPub    Inside  ...        0    NaN    NaN         NaN   \n2            Lvl    AllPub       FR2  ...        0    NaN    NaN         NaN   \n3            Lvl    AllPub    Inside  ...        0    NaN    NaN         NaN   \n4            Lvl    AllPub    Corner  ...        0    NaN    NaN         NaN   \n5            Lvl    AllPub       FR2  ...        0    NaN    NaN         NaN   \n...          ...       ...       ...  ...      ...    ...    ...         ...   \n1456         Lvl    AllPub    Inside  ...        0    NaN    NaN         NaN   \n1457         Lvl    AllPub    Inside  ...        0    NaN  MnPrv         NaN   \n1458         Lvl    AllPub    Inside  ...        0    NaN  GdPrv        Shed   \n1459         Lvl    AllPub    Inside  ...        0    NaN    NaN         NaN   \n1460         Lvl    AllPub    Inside  ...        0    NaN    NaN         NaN   \n\n     MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  \nId                                                               \n1          0      2    2008        WD         Normal     208500  \n2          0      5    2007        WD         Normal     181500  \n3          0      9    2008        WD         Normal     223500  \n4          0      2    2006        WD        Abnorml     140000  \n5          0     12    2008        WD         Normal     250000  \n...      ...    ...     ...       ...            ...        ...  
\n1456       0      8    2007        WD         Normal     175000  \n1457       0      2    2010        WD         Normal     210000  \n1458    2500      5    2010        WD         Normal     266500  \n1459       0      4    2010        WD         Normal     142125  \n1460       0      6    2008        WD         Normal     147500  \n\n[1460 rows x 80 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>MSSubClass</th>\n      <th>MSZoning</th>\n      <th>LotFrontage</th>\n      <th>LotArea</th>\n      <th>Street</th>\n      <th>Alley</th>\n      <th>LotShape</th>\n      <th>LandContour</th>\n      <th>Utilities</th>\n      <th>LotConfig</th>\n      <th>...</th>\n      <th>PoolArea</th>\n      <th>PoolQC</th>\n      <th>Fence</th>\n      <th>MiscFeature</th>\n      <th>MiscVal</th>\n      <th>MoSold</th>\n      <th>YrSold</th>\n      <th>SaleType</th>\n      <th>SaleCondition</th>\n      <th>SalePrice</th>\n    </tr>\n    <tr>\n      <th>Id</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1</th>\n      <td>60</td>\n      <td>RL</td>\n      <td>65.0</td>\n      <td>8450</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Inside</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      
<td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>2</td>\n      <td>2008</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>208500</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>20</td>\n      <td>RL</td>\n      <td>80.0</td>\n      <td>9600</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>FR2</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>5</td>\n      <td>2007</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>181500</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>60</td>\n      <td>RL</td>\n      <td>68.0</td>\n      <td>11250</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>IR1</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Inside</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>9</td>\n      <td>2008</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>223500</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>70</td>\n      <td>RL</td>\n      <td>60.0</td>\n      <td>9550</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>IR1</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Corner</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>2</td>\n      <td>2006</td>\n      <td>WD</td>\n      <td>Abnorml</td>\n      <td>140000</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>60</td>\n      <td>RL</td>\n      <td>84.0</td>\n      <td>14260</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>IR1</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>FR2</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>12</td>\n      <td>2008</td>\n      <td>WD</td>\n      <td>Normal</td>\n     
 <td>250000</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1456</th>\n      <td>60</td>\n      <td>RL</td>\n      <td>62.0</td>\n      <td>7917</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Inside</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>8</td>\n      <td>2007</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>175000</td>\n    </tr>\n    <tr>\n      <th>1457</th>\n      <td>20</td>\n      <td>RL</td>\n      <td>85.0</td>\n      <td>13175</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Inside</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>MnPrv</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>2</td>\n      <td>2010</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>210000</td>\n    </tr>\n    <tr>\n      <th>1458</th>\n      <td>70</td>\n      <td>RL</td>\n      <td>66.0</td>\n      <td>9042</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Inside</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>GdPrv</td>\n      <td>Shed</td>\n      <td>2500</td>\n      <td>5</td>\n      <td>2010</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>266500</td>\n    </tr>\n    <tr>\n      <th>1459</th>\n      <td>20</td>\n      <td>RL</td>\n      <td>68.0</td>\n    
  <td>9717</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Inside</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>4</td>\n      <td>2010</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>142125</td>\n    </tr>\n    <tr>\n      <th>1460</th>\n      <td>20</td>\n      <td>RL</td>\n      <td>75.0</td>\n      <td>9937</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>Inside</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>6</td>\n      <td>2008</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>147500</td>\n    </tr>\n  </tbody>\n</table>\n<p>1460 rows × 80 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"### EDA  \nIn this EDA, we use descriptive statistics and plots to assess data quality and identify the main changes needed. 
For further insights into EDA, visit the following link:  \nhttps://www.kaggle.com/code/zahrazolghadr/eda-in-data-mining","metadata":{}},{"cell_type":"code","source":"# Pin the profiler version (the one this notebook was run with) so the reports are reproducible;\n# %pip installs into the environment of the running kernel.\n%pip install ydata_profiling==4.6.4","metadata":{"execution":{"iopub.status.busy":"2024-02-27T11:59:28.649730Z","iopub.execute_input":"2024-02-27T11:59:28.650197Z","iopub.status.idle":"2024-02-27T11:59:42.862032Z","shell.execute_reply.started":"2024-02-27T11:59:28.650165Z","shell.execute_reply":"2024-02-27T11:59:42.860853Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stdout","text":"Requirement already satisfied: ydata_profiling in /opt/conda/lib/python3.10/site-packages (4.6.4)\nRequirement already satisfied: scipy<1.12,>=1.4.1 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (1.11.4)\nRequirement already satisfied: pandas!=1.4.0,<3,>1.1 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (2.2.0)\nRequirement already satisfied: matplotlib<3.9,>=3.2 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (3.7.4)\nRequirement already satisfied: pydantic>=2 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (2.5.3)\nRequirement already satisfied: PyYAML<6.1,>=5.0.0 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (6.0.1)\nRequirement already satisfied: jinja2<3.2,>=2.11.1 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (3.1.2)\nRequirement already satisfied: visions==0.7.5 in /opt/conda/lib/python3.10/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (0.7.5)\nRequirement already satisfied: numpy<1.26,>=1.16.0 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (1.24.4)\nRequirement already satisfied: htmlmin==0.1.12 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (0.1.12)\nRequirement already satisfied: phik<0.13,>=0.11.1 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (0.12.4)\nRequirement already satisfied: requests<3,>=2.24.0 in 
/opt/conda/lib/python3.10/site-packages (from ydata_profiling) (2.31.0)\nRequirement already satisfied: tqdm<5,>=4.48.2 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (4.66.1)\nRequirement already satisfied: seaborn<0.13,>=0.10.1 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (0.12.2)\nRequirement already satisfied: multimethod<2,>=1.4 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (1.10)\nRequirement already satisfied: statsmodels<1,>=0.13.2 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (0.14.1)\nRequirement already satisfied: typeguard<5,>=4.1.2 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (4.1.5)\nRequirement already satisfied: imagehash==4.3.1 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (4.3.1)\nRequirement already satisfied: wordcloud>=1.9.1 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (1.9.3)\nRequirement already satisfied: dacite>=1.8 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (1.8.1)\nRequirement already satisfied: numba<0.59.0,>=0.56.0 in /opt/conda/lib/python3.10/site-packages (from ydata_profiling) (0.58.1)\nRequirement already satisfied: PyWavelets in /opt/conda/lib/python3.10/site-packages (from imagehash==4.3.1->ydata_profiling) (1.5.0)\nRequirement already satisfied: pillow in /opt/conda/lib/python3.10/site-packages (from imagehash==4.3.1->ydata_profiling) (9.5.0)\nRequirement already satisfied: attrs>=19.3.0 in /opt/conda/lib/python3.10/site-packages (from visions==0.7.5->visions[type_image_path]==0.7.5->ydata_profiling) (23.2.0)\nRequirement already satisfied: networkx>=2.4 in /opt/conda/lib/python3.10/site-packages (from visions==0.7.5->visions[type_image_path]==0.7.5->ydata_profiling) (3.2.1)\nRequirement already satisfied: tangled-up-in-unicode>=0.0.4 in /opt/conda/lib/python3.10/site-packages (from visions==0.7.5->visions[type_image_path]==0.7.5->ydata_profiling) 
(0.2.0)\n\u001b[33mWARNING: visions 0.7.5 does not provide the extra 'type-image-path'\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2<3.2,>=2.11.1->ydata_profiling) (2.1.3)\nRequirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib<3.9,>=3.2->ydata_profiling) (1.2.0)\nRequirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.10/site-packages (from matplotlib<3.9,>=3.2->ydata_profiling) (0.12.1)\nRequirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib<3.9,>=3.2->ydata_profiling) (4.47.0)\nRequirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib<3.9,>=3.2->ydata_profiling) (1.4.5)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib<3.9,>=3.2->ydata_profiling) (21.3)\nRequirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib<3.9,>=3.2->ydata_profiling) (3.1.1)\nRequirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib<3.9,>=3.2->ydata_profiling) (2.8.2)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /opt/conda/lib/python3.10/site-packages (from numba<0.59.0,>=0.56.0->ydata_profiling) (0.41.1)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas!=1.4.0,<3,>1.1->ydata_profiling) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas!=1.4.0,<3,>1.1->ydata_profiling) (2023.4)\nRequirement already satisfied: joblib>=0.14.1 in /opt/conda/lib/python3.10/site-packages (from phik<0.13,>=0.11.1->ydata_profiling) (1.3.2)\nRequirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from 
pydantic>=2->ydata_profiling) (0.6.0)\nRequirement already satisfied: pydantic-core==2.14.6 in /opt/conda/lib/python3.10/site-packages (from pydantic>=2->ydata_profiling) (2.14.6)\nRequirement already satisfied: typing-extensions>=4.6.1 in /opt/conda/lib/python3.10/site-packages (from pydantic>=2->ydata_profiling) (4.9.0)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.24.0->ydata_profiling) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.24.0->ydata_profiling) (3.6)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.24.0->ydata_profiling) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.24.0->ydata_profiling) (2023.11.17)\nRequirement already satisfied: patsy>=0.5.4 in /opt/conda/lib/python3.10/site-packages (from statsmodels<1,>=0.13.2->ydata_profiling) (0.5.6)\nRequirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from patsy>=0.5.4->statsmodels<1,>=0.13.2->ydata_profiling) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"from ydata_profiling import ProfileReport\n\n# Generate a profile report\nprofile = ProfileReport(df, title=\"House Price Dataset\")\n\n# Save the report to an HTML file\nprofile.to_file(\"your_dataset_profile_report-1.html\")","metadata":{"execution":{"iopub.status.busy":"2024-02-27T12:21:50.949830Z","iopub.execute_input":"2024-02-27T12:21:50.950378Z","iopub.status.idle":"2024-02-27T12:25:57.163916Z","shell.execute_reply.started":"2024-02-27T12:21:50.950334Z","shell.execute_reply":"2024-02-27T12:25:57.162980Z"},"trusted":true},"execution_count":3,"outputs":[{"output_type":"display_data","data":{"text/plain":"Summarize dataset:   0%|          | 0/5 [00:00<?, 
?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"264f27983d2842ddbecf44bc284a9b0a"}},"metadata":{}},{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/ydata_profiling/model/correlations.py:66: UserWarning: There was an attempt to calculate the auto correlation, but this failed.\nTo hide this warning, disable the calculation\n(using `df.profile_report(correlations={\"auto\": {\"calculate\": False}})`\nIf this is problematic for your use case, please report this as an issue:\nhttps://github.com/ydataai/ydata-profiling/issues\n(include the error message: 'could not convert string to float: 'Grvl'')\n  warnings.warn(\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e207520801a4428686660bd2010c70d2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f4030dbd2a05424687af59810c585033"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"fbbecabf27cc4f1da932b18b97447cc8"}},"metadata":{}}]},{"cell_type":"markdown","source":"Ideas:  \n\nThere are a large number of missing values in some features.  \n\nThe dataset contains outliers.  \n\nCertain categorical features have a high number of distinct values.  \n\nSome categorical features have categories with low frequencies.  \n\nMost continuous features exhibit significant skewness.  
\n\n...","metadata":{}},{"cell_type":"markdown","source":"### Data Cleaning  \nBased on findings from EDA, we implement data cleaning actions to enhance the primary data quality. For more insights on Data Cleaning, please visit the following link:  \nhttps://www.kaggle.com/code/zahrazolghadr/data-cleaning-in-dm-featurescreening-consistency","metadata":{}},{"cell_type":"markdown","source":"In certain features, we replace missing values with 'non-existent' since the item does not exist in those instances. For instance, a house without a pool would have undefined pool quality.","metadata":{}},{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\n\ncolumns_list = ['Alley', 'BsmtQual', 'BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu',\n                'GarageType','GarageFinish','GarageQual','GarageCond', 'PoolQC', 'Fence', 'MiscFeature']\n\n# Replace NaN with \"non-existent\" in the specified columns (a missing value here means the item is absent)\ndf[columns_list] = df[columns_list].replace(np.nan, 'non-existent')\ndf.info()","metadata":{"execution":{"iopub.status.busy":"2024-02-27T17:03:37.290677Z","iopub.execute_input":"2024-02-27T17:03:37.291461Z","iopub.status.idle":"2024-02-27T17:03:37.314073Z","shell.execute_reply.started":"2024-02-27T17:03:37.291432Z","shell.execute_reply":"2024-02-27T17:03:37.312787Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nIndex: 1460 entries, 1 to 1460\nData columns (total 80 columns):\n #   Column         Non-Null Count  Dtype  \n---  ------         --------------  -----  \n 0   MSSubClass     1460 non-null   int64  \n 1   MSZoning       1460 non-null   object \n 2   LotFrontage    1201 non-null   float64\n 3   LotArea        1460 non-null   int64  \n 4   Street         1460 non-null   object \n 5   Alley          1460 non-null   object \n 6   LotShape       1460 non-null   object \n 7   LandContour    1460 non-null   object \n 8   Utilities      1460 non-null   object \n 9   
LotConfig      1460 non-null   object \n 10  LandSlope      1460 non-null   object \n 11  Neighborhood   1460 non-null   object \n 12  Condition1     1460 non-null   object \n 13  Condition2     1460 non-null   object \n 14  BldgType       1460 non-null   object \n 15  HouseStyle     1460 non-null   object \n 16  OverallQual    1460 non-null   int64  \n 17  OverallCond    1460 non-null   int64  \n 18  YearBuilt      1460 non-null   int64  \n 19  YearRemodAdd   1460 non-null   int64  \n 20  RoofStyle      1460 non-null   object \n 21  RoofMatl       1460 non-null   object \n 22  Exterior1st    1460 non-null   object \n 23  Exterior2nd    1460 non-null   object \n 24  MasVnrType     588 non-null    object \n 25  MasVnrArea     1452 non-null   float64\n 26  ExterQual      1460 non-null   object \n 27  ExterCond      1460 non-null   object \n 28  Foundation     1460 non-null   object \n 29  BsmtQual       1460 non-null   object \n 30  BsmtCond       1460 non-null   object \n 31  BsmtExposure   1460 non-null   object \n 32  BsmtFinType1   1460 non-null   object \n 33  BsmtFinSF1     1460 non-null   int64  \n 34  BsmtFinType2   1460 non-null   object \n 35  BsmtFinSF2     1460 non-null   int64  \n 36  BsmtUnfSF      1460 non-null   int64  \n 37  TotalBsmtSF    1460 non-null   int64  \n 38  Heating        1460 non-null   object \n 39  HeatingQC      1460 non-null   object \n 40  CentralAir     1460 non-null   object \n 41  Electrical     1459 non-null   object \n 42  1stFlrSF       1460 non-null   int64  \n 43  2ndFlrSF       1460 non-null   int64  \n 44  LowQualFinSF   1460 non-null   int64  \n 45  GrLivArea      1460 non-null   int64  \n 46  BsmtFullBath   1460 non-null   int64  \n 47  BsmtHalfBath   1460 non-null   int64  \n 48  FullBath       1460 non-null   int64  \n 49  HalfBath       1460 non-null   int64  \n 50  BedroomAbvGr   1460 non-null   int64  \n 51  KitchenAbvGr   1460 non-null   int64  \n 52  KitchenQual    1460 non-null   object \n 53  TotRmsAbvGrd   1460 
non-null   int64  \n 54  Functional     1460 non-null   object \n 55  Fireplaces     1460 non-null   int64  \n 56  FireplaceQu    1460 non-null   object \n 57  GarageType     1460 non-null   object \n 58  GarageYrBlt    1379 non-null   float64\n 59  GarageFinish   1460 non-null   object \n 60  GarageCars     1460 non-null   int64  \n 61  GarageArea     1460 non-null   int64  \n 62  GarageQual     1460 non-null   object \n 63  GarageCond     1460 non-null   object \n 64  PavedDrive     1460 non-null   object \n 65  WoodDeckSF     1460 non-null   int64  \n 66  OpenPorchSF    1460 non-null   int64  \n 67  EnclosedPorch  1460 non-null   int64  \n 68  3SsnPorch      1460 non-null   int64  \n 69  ScreenPorch    1460 non-null   int64  \n 70  PoolArea       1460 non-null   int64  \n 71  PoolQC         1460 non-null   object \n 72  Fence          1460 non-null   object \n 73  MiscFeature    1460 non-null   object \n 74  MiscVal        1460 non-null   int64  \n 75  MoSold         1460 non-null   int64  \n 76  YrSold         1460 non-null   int64  \n 77  SaleType       1460 non-null   object \n 78  SaleCondition  1460 non-null   object \n 79  SalePrice      1460 non-null   int64  \ndtypes: float64(3), int64(34), object(43)\nmemory usage: 923.9+ KB\n","output_type":"stream"}]},{"cell_type":"markdown","source":"For the 'MSSubClass' feature, categories initially treated as numerical values are corrected to be considered as ordinal variables, grouped into values 1, 2, and 3.","metadata":{}},{"cell_type":"code","source":"import pandas as pd\n\n# Define the MSSubClass categories\nbins = [0, 60, 120, float('inf')]  # Bin edges; with right=False the bins are [0, 60), [60, 120), [120, inf)\nlabels = ['1', '2', '3']  # These are the corresponding labels\n\n# Overwrite 'MSSubClass' in place with its bin label ('1', '2' or '3')\ndf['MSSubClass'] = pd.cut(df['MSSubClass'], bins=bins, labels=labels, 
right=False)\n\n# Print the counts explicitly: a bare expression that is not the last line of a cell is not displayed\nprint(df['MSSubClass'].value_counts())\ndf['MSSubClass'].info()","metadata":{"execution":{"iopub.status.busy":"2024-02-27T17:03:46.816108Z","iopub.execute_input":"2024-02-27T17:03:46.818065Z","iopub.status.idle":"2024-02-27T17:03:46.831898Z","shell.execute_reply.started":"2024-02-27T17:03:46.818035Z","shell.execute_reply":"2024-02-27T17:03:46.831023Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"<class 'pandas.core.series.Series'>\nIndex: 1460 entries, 1 to 1460\nSeries name: MSSubClass\nNon-Null Count  Dtype   \n--------------  -----   \n1460 non-null   category\ndtypes: category(1)\nmemory usage: 13.0 KB\n","output_type":"stream"}]},{"cell_type":"markdown","source":"This is the time to separate features (X) from the target field (y).","metadata":{}},{"cell_type":"code","source":"y = df.SalePrice\nX = df.drop('SalePrice', axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-02-27T17:03:50.990468Z","iopub.execute_input":"2024-02-27T17:03:50.990798Z","iopub.status.idle":"2024-02-27T17:03:50.998178Z","shell.execute_reply.started":"2024-02-27T17:03:50.990774Z","shell.execute_reply":"2024-02-27T17:03:50.997397Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"markdown","source":"We partition the data into training and testing sets to prevent data leakage, allocating 70% for training and 30% for testing.","metadata":{}},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\n\n\n# split into train and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, 
from ydata_profiling import ProfileReport

# Columns whose automatically inferred type should be overridden in the report.
categorical_overrides = ["MSSubClass", "OverallQual", "OverallCond"]
numeric_overrides = ["BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
                     "KitchenAbvGr",  # was "Kitchen", which is not a column of the dataset
                     "Fireplaces", "GarageCars", "YrSold"]

type_schema = {
    **{column: "categorical" for column in categorical_overrides},
    **{column: "numeric" for column in numeric_overrides},
}

# Fail loudly on a misspelled column name instead of letting the override be
# dropped without notice.
unknown_columns = sorted(set(type_schema) - set(X_train.columns))
assert not unknown_columns, f"type_schema names columns missing from X_train: {unknown_columns}"

# Generate a profile report on the training features only
profile = ProfileReport(X_train, title="House Price data EDA", type_schema=type_schema)

# Save the report to an HTML file
profile.to_file("your_dataset_profile_report_2.html")
# --- Lookup tables -----------------------------------------------------------
# Each table is written once and applied in a loop below, replacing one
# copy-pasted assignment per column.

# Shared scale for the quality / condition grade columns (codes are strings).
quality_codes = {'non-existent': '0', 'Gd': '3', 'Ex': '3', 'Fa': '1', 'Po': '1', 'TA': '2'}
exposure_codes = {'non-existent': '0', 'No': '1', 'Mn': '2', 'Av': '3', 'Gd': '4'}
finish_codes = {'non-existent': '0', 'Unf': '1', 'LwQ': '2', 'Rec': '3', 'BLQ': '4', 'ALQ': '5', 'GLQ': '6'}

# column -> mapping from original level to ordinal code
ordinal_recodes = {
    'ExterQual': quality_codes,
    'ExterCond': quality_codes,
    'BsmtQual': quality_codes,
    'BsmtCond': quality_codes,
    'HeatingQC': quality_codes,
    'KitchenQual': quality_codes,
    'FireplaceQu': quality_codes,
    'GarageQual': quality_codes,
    'GarageCond': quality_codes,
    'BsmtExposure': exposure_codes,
    'BsmtFinType1': finish_codes,
    'BsmtFinType2': finish_codes,
}

# column -> (levels to merge, label of the merged level)
merged_levels = {
    'LotShape': (['IR1', 'IR2', 'IR3'], 'IR'),
    'LandContour': (['Bnk', 'HLS', 'Low'], 'Not Lvl'),
    'LotConfig': (['CulDSac', 'FR2', 'FR3'], 'Others'),
    'LandSlope': (['Mod', 'Sev'], 'Not Gtl'),
}

# column -> (level to keep, label when it matches, label for everything else)
binary_levels = {
    'Condition1': ('Norm', 'Norm', 'Not Norm'),
    'Condition2': ('Norm', 'Norm', 'Not Norm'),
    'RoofStyle': ('Gable', 'Gable', 'Not Gable'),
    # NOTE(review): every value other than 'Stone' becomes 'Brick', including
    # any level that means "no masonry veneer" - confirm this is intended.
    'MasVnrType': ('Stone', 'Stone', 'Brick'),
    'Heating': ('GasA', 'GasA', 'Others'),
    'Electrical': ('SBrkr', 'Standard', 'Not Standard'),
    'Functional': ('Typ', 'Typ', 'Not Typ'),
    'SaleType': ('WD', 'WD', 'Not WD'),
    'SaleCondition': ('Normal', 'Normal', 'Not Normal'),
}


def _collapse_to_binary(series, keep, kept_label, other_label):
    """Return `series` with `keep` mapped to `kept_label` and any other value to `other_label`."""
    return series.apply(lambda value: kept_label if value == keep else other_label)


# --- Apply the tables to the training features -------------------------------
for column, (rare_levels, merged_label) in merged_levels.items():
    X_train[column] = X_train[column].replace(rare_levels, merged_label)

for column, (keep, kept_label, other_label) in binary_levels.items():
    X_train[column] = _collapse_to_binary(X_train[column], keep, kept_label, other_label)

for column, codes in ordinal_recodes.items():
    X_train[column] = X_train[column].replace(codes)
# Continuous columns whose coefficient of variation (CV) is below this value
# are treated as carrying too little variation and are screened out.
min_cv = 0.1

column_std = X_train[continuous].std()
column_mean = X_train[continuous].mean()

# CV = std / |mean|. The absolute value stops a column with a negative mean
# from producing a negative ratio that would always satisfy "< min_cv".
cv_values = column_std / column_mean.abs()

# A constant column has std == 0; when its mean is 0 as well the ratio is NaN
# and the comparison alone would miss it, so constant columns are flagged
# explicitly.
screen_cv = cv_values[(cv_values < min_cv) | column_std.eq(0)].index.tolist()
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder


# Work on a copy and fill NaN values (mean for continuous columns, mode for
# categorical ones) so the encoders and the model receive complete data.
inputs_iso = X_train_screened.copy()
for col in inputs_iso.columns:
    if col in continuous:
        inputs_iso[col] = inputs_iso[col].fillna(inputs_iso[col].mean())
    elif col in categorical:
        mode_val = inputs_iso[col].mode().iloc[0]  # first mode when several tie
        inputs_iso[col] = inputs_iso[col].fillna(mode_val)


ordinal = ['MSSubClass', 'ExterQual', 'ExterCond','BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
           'BsmtFinType2','HeatingQC','KitchenQual','FireplaceQu','GarageQual', 'GarageCond']

nominal = ['MSZoning','Alley','LotShape','LandContour','LotConfig','LandSlope','Neighborhood', 'Condition1','BldgType',
           'HouseStyle', 'RoofStyle','Exterior1st', 'Exterior2nd', 'MasVnrType','Foundation','CentralAir',
           'Functional','GarageType','GarageFinish','PavedDrive','Fence','SaleType', 'SaleCondition']

# `continuous` was built before feature screening; keep only the columns that
# survived it so the selection below cannot raise a KeyError.
continuous_kept = [col for col in continuous if col in inputs_iso.columns]

# Encode the categorical columns
ordinal_encoder = OrdinalEncoder()
inputs_iso[ordinal] = ordinal_encoder.fit_transform(inputs_iso[ordinal])

one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(inputs_iso[nominal])
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out())

# Model features only. The index (the row identifier 'Id') is dropped here:
# it is a label, not a property of the house, and must not influence which
# rows are flagged as outliers.
outlier_features = pd.concat(
    [inputs_iso[ordinal].reset_index(drop=True),
     one_hot_encoded_df,
     inputs_iso[continuous_kept].reset_index(drop=True)],
    axis=1,
)

# Apply Z-score scaling to the feature columns
scaler = StandardScaler()
inputs_iso_encoded_array = scaler.fit_transform(outlier_features)

# Fit the Isolation Forest and label every training row
clf = IsolationForest(contamination=0.02, random_state=42)
clf.fit(inputs_iso_encoded_array)
outliers = clf.predict(inputs_iso_encoded_array)

# Re-attach the identifier as the first column (same layout as before) and add
# the prediction, so later cells can look up the Ids of the flagged rows.
row_ids = pd.Series(inputs_iso.index.to_numpy(), name='Id')
inputs_iso_encoded = pd.concat([row_ids, outlier_features], axis=1)
inputs_iso_encoded['outlier'] = outliers

# Display the DataFrame with outlier information
print(inputs_iso_encoded)

# Share of rows labelled -1, i.e. flagged as outliers
percentage_outliers = (outliers[outliers == -1].shape[0] / len(outliers)) * 100
print(f"Percentage of outliers: {percentage_outliers:.2f}%")
1.0        1.0       2.0       2.0   \n...    ...         ...        ...        ...       ...       ...   \n1017   348         0.0        1.0        1.0       2.0       2.0   \n1018  1101         0.0        1.0        1.0       2.0       1.0   \n1019    14         0.0        2.0        1.0       3.0       2.0   \n1020  1426         0.0        2.0        1.0       2.0       2.0   \n1021   132         1.0        2.0        1.0       3.0       2.0   \n\n      BsmtExposure  BsmtFinType1  BsmtFinType2  HeatingQC  ...  3SsnPorch  \\\n0              1.0           6.0           1.0        2.0  ...          0   \n1              1.0           1.0           1.0        2.0  ...          0   \n2              1.0           5.0           1.0        1.0  ...          0   \n3              1.0           3.0           1.0        1.0  ...          0   \n4              1.0           1.0           1.0        0.0  ...          0   \n...            ...           ...           ...        ...  ...        ...   \n1017           1.0           4.0           1.0        2.0  ...          0   \n1018           1.0           3.0           1.0        1.0  ...          0   \n1019           3.0           1.0           1.0        2.0  ...          0   \n1020           1.0           1.0           1.0        2.0  ...          0   \n1021           1.0           6.0           1.0        2.0  ...          0   \n\n      ScreenPorch  PoolArea  MiscVal  MoSold  AgeBuilt  AgeRemodAdd  \\\n0               0         0        0      12         4            4   \n1               0         0        0       6         7            7   \n2               0         0        0       7        33           15   \n3               0         0        0       6        53           53   \n4               0         0        0       6        34           34   \n...           ...       ...      ...     ...       ...          ...   
# Ids of the training rows that were labelled -1 in the 'outlier' column.
is_outlier = inputs_iso_encoded['outlier'].eq(-1)
outlier_index = inputs_iso_encoded.loc[is_outlier, 'Id']
ids_to_drop = outlier_index.tolist()

# Remove the same rows from the features and from the target so the two stay
# aligned.
X_train_outprep = X_train_screened.drop(index=ids_to_drop)
y_train_outprep = y_train.drop(index=ids_to_drop)

X_train_outprep.info()
   1001 non-null   object  \n 6   LandContour    1001 non-null   object  \n 7   LotConfig      1001 non-null   object  \n 8   LandSlope      1001 non-null   object  \n 9   Neighborhood   1001 non-null   object  \n 10  Condition1     1001 non-null   object  \n 11  BldgType       1001 non-null   object  \n 12  HouseStyle     1001 non-null   object  \n 13  OverallQual    1001 non-null   int64   \n 14  OverallCond    1001 non-null   int64   \n 15  RoofStyle      1001 non-null   object  \n 16  Exterior1st    1001 non-null   object  \n 17  Exterior2nd    1001 non-null   object  \n 18  MasVnrType     1001 non-null   object  \n 19  MasVnrArea     994 non-null    float64 \n 20  ExterQual      1001 non-null   object  \n 21  ExterCond      1001 non-null   object  \n 22  Foundation     1001 non-null   object  \n 23  BsmtQual       1001 non-null   object  \n 24  BsmtCond       1001 non-null   object  \n 25  BsmtExposure   1001 non-null   object  \n 26  BsmtFinType1   1001 non-null   object  \n 27  BsmtFinSF1     1001 non-null   int64   \n 28  BsmtFinType2   1001 non-null   object  \n 29  BsmtFinSF2     1001 non-null   int64   \n 30  BsmtUnfSF      1001 non-null   int64   \n 31  TotalBsmtSF    1001 non-null   int64   \n 32  HeatingQC      1001 non-null   object  \n 33  CentralAir     1001 non-null   object  \n 34  Electrical     1001 non-null   object  \n 35  1stFlrSF       1001 non-null   int64   \n 36  2ndFlrSF       1001 non-null   int64   \n 37  LowQualFinSF   1001 non-null   int64   \n 38  GrLivArea      1001 non-null   int64   \n 39  BsmtFullBath   1001 non-null   int64   \n 40  BsmtHalfBath   1001 non-null   int64   \n 41  FullBath       1001 non-null   int64   \n 42  HalfBath       1001 non-null   int64   \n 43  BedroomAbvGr   1001 non-null   int64   \n 44  KitchenAbvGr   1001 non-null   int64   \n 45  KitchenQual    1001 non-null   object  \n 46  TotRmsAbvGrd   1001 non-null   int64   \n 47  Functional     1001 non-null   object  \n 48  Fireplaces     1001 non-null   
int64   \n 49  FireplaceQu    1001 non-null   object  \n 50  GarageType     1001 non-null   object  \n 51  GarageFinish   1001 non-null   object  \n 52  GarageCars     1001 non-null   int64   \n 53  GarageArea     1001 non-null   int64   \n 54  GarageQual     1001 non-null   object  \n 55  GarageCond     1001 non-null   object  \n 56  PavedDrive     1001 non-null   object  \n 57  WoodDeckSF     1001 non-null   int64   \n 58  OpenPorchSF    1001 non-null   int64   \n 59  EnclosedPorch  1001 non-null   int64   \n 60  3SsnPorch      1001 non-null   int64   \n 61  ScreenPorch    1001 non-null   int64   \n 62  PoolArea       1001 non-null   int64   \n 63  Fence          1001 non-null   object  \n 64  MiscVal        1001 non-null   int64   \n 65  MoSold         1001 non-null   int64   \n 66  SaleType       1001 non-null   object  \n 67  SaleCondition  1001 non-null   object  \n 68  AgeBuilt       1001 non-null   int64   \n 69  AgeRemodAdd    1001 non-null   int64   \n 70  AgeGarageBlt   956 non-null    float64 \n 71  AgeSold        1001 non-null   int64   \ndtypes: category(1), float64(3), int64(32), object(36)\nmemory usage: 564.2+ KB\n","output_type":"stream"}]},{"cell_type":"markdown","source":"### Data Cleaning- Missing Values  \nTo handle missing values, we generate a report on rows and columns with missing values, employing KNNImputer as a multi-dimensional method for imputation. 
# Store, for every row, how many of its cells are missing.
X_train_outprep['Num_Missing_Values'] = X_train_outprep.isna().sum(axis=1)

# Rows that have at least one missing cell
has_missing = X_train_outprep['Num_Missing_Values'].gt(0)
rows_with_missing_values = X_train_outprep.loc[has_missing]

total_rows = X_train_outprep.shape[0]
rows_with_missing_count = rows_with_missing_values.shape[0]
percentage_rows_with_missing = (rows_with_missing_count / total_rows) * 100

# Text summary, followed by a separator line
report_lines = [
    "Report on Rows with Missing Values:",
    f"Total Rows: {total_rows}",
    f"Rows with Missing Values: {rows_with_missing_count} ({percentage_rows_with_missing:.2f}%)",
]
print("\n".join(report_lines))

print('='*50)

# Distribution of the per-row missing counts (displayed as the cell result)
X_train_outprep['Num_Missing_Values'].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])
# Build the null mask once and derive both per-column statistics from it.
null_mask = X_train_outprep.isnull()
missing_counts = null_mask.sum()
missing_percentages = null_mask.mean() * 100

# One row per column of X_train_outprep, indexed by column name
missing_values_report = pd.DataFrame({
    'Column': X_train_outprep.columns,
    'Missing Values': missing_counts,
    'Percentage Missing': missing_percentages
})

# Show the 15 columns with the largest share of missing values
worst_columns = missing_values_report.sort_values(by='Percentage Missing', ascending = False).head(15)
print("Missing Values Report:")
print(worst_columns)
import pandas as pd
from sklearn.impute import KNNImputer

# Numeric columns that are passed to the imputer together
columns_to_impute = ['LotFrontage', 'AgeGarageBlt', 'MasVnrArea']

# KNN imputer with its default settings
knn_imputer = KNNImputer()

# Fit on the selected columns and write the completed values back in place
imputed_values = knn_imputer.fit_transform(X_train_outprep[columns_to_impute])
X_train_outprep[columns_to_impute] = imputed_values
def frequency_table(variable):
    """Print a frequency table for the values in ``variable``.

    One line is printed per distinct value, in the sorted order returned by
    ``np.unique``, in the form ``<value>: Count: <n>, Percentage: <p>%`` where
    the percentage is relative to ``len(variable)`` and shown with two decimals.

    Parameters
    ----------
    variable : array-like
        Sequence of values to tabulate (for example a list or a pandas Series).

    Returns
    -------
    None
        The table is written to standard output only.
    """
    # Imported locally: none of the cells shown before this one import numpy,
    # so relying on a global `np` fails on a fresh kernel.
    import numpy as np

    # Distinct values and how often each occurs
    unique_elements, counts = np.unique(variable, return_counts=True)

    # Share of each value among all entries, in percent
    percentages = (counts / len(variable)) * 100

    for value, count, percentage in zip(unique_elements, counts, percentages):
        print(f"{value}: Count: {count}, Percentage: {percentage:.2f}%")
print(f'{variable}_cat bin edges:', kbin_discretizer.bin_edges_[0])\n    \n    # Print the frequency table\n    frequency_table(X_train_outprep[f'{variable}_cat'])\n    print(\"\\n\")","metadata":{"execution":{"iopub.status.busy":"2024-02-27T17:24:22.896271Z","iopub.execute_input":"2024-02-27T17:24:22.896576Z","iopub.status.idle":"2024-02-27T17:24:22.961791Z","shell.execute_reply.started":"2024-02-27T17:24:22.896553Z","shell.execute_reply":"2024-02-27T17:24:22.960861Z"},"trusted":true},"execution_count":35,"outputs":[{"name":"stdout","text":"MasVnrArea_cat bin edges: [   0.  158. 1600.]\n0.0: Count: 749, Percentage: 74.83%\n1.0: Count: 252, Percentage: 25.17%\n\n\nBsmtFinSF1_cat bin edges: [   0.  387.  731. 2096.]\n0.0: Count: 499, Percentage: 49.85%\n1.0: Count: 251, Percentage: 25.07%\n2.0: Count: 251, Percentage: 25.07%\n\n\nBsmtFinSF2_cat bin edges: [   0. 1474.]\n0.0: Count: 1001, Percentage: 100.00%\n\n\n2ndFlrSF_cat bin edges: [   0.  728. 2065.]\n0.0: Count: 750, Percentage: 74.93%\n1.0: Count: 251, Percentage: 25.07%\n\n\nLowQualFinSF_cat bin edges: [  0. 528.]\n0.0: Count: 1001, Percentage: 100.00%\n\n\nGarageArea_cat bin edges: [   0.  336.  480.  576. 1390.]\n0.0: Count: 249, Percentage: 24.88%\n1.0: Count: 238, Percentage: 23.78%\n2.0: Count: 226, Percentage: 22.58%\n3.0: Count: 288, Percentage: 28.77%\n\n\nWoodDeckSF_cat bin edges: [  0. 169. 857.]\n0.0: Count: 750, Percentage: 74.93%\n1.0: Count: 251, Percentage: 25.07%\n\n\nOpenPorchSF_cat bin edges: [  0.  27.  66. 502.]\n0.0: Count: 500, Percentage: 49.95%\n1.0: Count: 250, Percentage: 24.98%\n2.0: Count: 251, Percentage: 25.07%\n\n\nEnclosedPorch_cat bin edges: [  0. 301.]\n0.0: Count: 1001, Percentage: 100.00%\n\n\n3SsnPorch_cat bin edges: [  0. 508.]\n0.0: Count: 1001, Percentage: 100.00%\n\n\nScreenPorch_cat bin edges: [  0. 396.]\n0.0: Count: 1001, Percentage: 100.00%\n\n\nPoolArea_cat bin edges: [  0. 738.]\n0.0: Count: 1001, Percentage: 100.00%\n\n\nMiscVal_cat bin edges: [    0. 
15500.]\n0.0: Count: 1001, Percentage: 100.00%\n\n\n<class 'pandas.core.frame.DataFrame'>\nIndex: 1001 entries, 708 to 132\nData columns (total 85 columns):\n #   Column             Non-Null Count  Dtype   \n---  ------             --------------  -----   \n 0   MSSubClass         1001 non-null   category\n 1   MSZoning           1001 non-null   object  \n 2   LotFrontage        1001 non-null   float64 \n 3   LotArea            1001 non-null   int64   \n 4   Alley              1001 non-null   object  \n 5   LotShape           1001 non-null   object  \n 6   LandContour        1001 non-null   object  \n 7   LotConfig          1001 non-null   object  \n 8   LandSlope          1001 non-null   object  \n 9   Neighborhood       1001 non-null   object  \n 10  Condition1         1001 non-null   object  \n 11  BldgType           1001 non-null   object  \n 12  HouseStyle         1001 non-null   object  \n 13  OverallQual        1001 non-null   int64   \n 14  OverallCond        1001 non-null   int64   \n 15  RoofStyle          1001 non-null   object  \n 16  Exterior1st        1001 non-null   object  \n 17  Exterior2nd        1001 non-null   object  \n 18  MasVnrType         1001 non-null   object  \n 19  MasVnrArea         1001 non-null   float64 \n 20  ExterQual          1001 non-null   object  \n 21  ExterCond          1001 non-null   object  \n 22  Foundation         1001 non-null   object  \n 23  BsmtQual           1001 non-null   object  \n 24  BsmtCond           1001 non-null   object  \n 25  BsmtExposure       1001 non-null   object  \n 26  BsmtFinType1       1001 non-null   object  \n 27  BsmtFinSF1         1001 non-null   int64   \n 28  BsmtFinType2       1001 non-null   object  \n 29  BsmtFinSF2         1001 non-null   int64   \n 30  BsmtUnfSF          1001 non-null   int64   \n 31  TotalBsmtSF        1001 non-null   int64   \n 32  HeatingQC          1001 non-null   object  \n 33  CentralAir         1001 non-null   object  \n 34  Electrical         1001 non-null   
object  \n 35  1stFlrSF           1001 non-null   int64   \n 36  2ndFlrSF           1001 non-null   int64   \n 37  LowQualFinSF       1001 non-null   int64   \n 38  GrLivArea          1001 non-null   int64   \n 39  BsmtFullBath       1001 non-null   int64   \n 40  BsmtHalfBath       1001 non-null   int64   \n 41  FullBath           1001 non-null   int64   \n 42  HalfBath           1001 non-null   int64   \n 43  BedroomAbvGr       1001 non-null   int64   \n 44  KitchenAbvGr       1001 non-null   int64   \n 45  KitchenQual        1001 non-null   object  \n 46  TotRmsAbvGrd       1001 non-null   int64   \n 47  Functional         1001 non-null   object  \n 48  Fireplaces         1001 non-null   int64   \n 49  FireplaceQu        1001 non-null   object  \n 50  GarageType         1001 non-null   object  \n 51  GarageFinish       1001 non-null   object  \n 52  GarageCars         1001 non-null   int64   \n 53  GarageArea         1001 non-null   int64   \n 54  GarageQual         1001 non-null   object  \n 55  GarageCond         1001 non-null   object  \n 56  PavedDrive         1001 non-null   object  \n 57  WoodDeckSF         1001 non-null   int64   \n 58  OpenPorchSF        1001 non-null   int64   \n 59  EnclosedPorch      1001 non-null   int64   \n 60  3SsnPorch          1001 non-null   int64   \n 61  ScreenPorch        1001 non-null   int64   \n 62  PoolArea           1001 non-null   int64   \n 63  Fence              1001 non-null   object  \n 64  MiscVal            1001 non-null   int64   \n 65  MoSold             1001 non-null   int64   \n 66  SaleType           1001 non-null   object  \n 67  SaleCondition      1001 non-null   object  \n 68  AgeBuilt           1001 non-null   int64   \n 69  AgeRemodAdd        1001 non-null   int64   \n 70  AgeGarageBlt       1001 non-null   float64 \n 71  AgeSold            1001 non-null   int64   \n 72  MasVnrArea_cat     1001 non-null   category\n 73  BsmtFinSF1_cat     1001 non-null   category\n 74  BsmtFinSF2_cat     1001 non-null   
category\n 75  2ndFlrSF_cat       1001 non-null   category\n 76  LowQualFinSF_cat   1001 non-null   category\n 77  GarageArea_cat     1001 non-null   category\n 78  WoodDeckSF_cat     1001 non-null   category\n 79  OpenPorchSF_cat    1001 non-null   category\n 80  EnclosedPorch_cat  1001 non-null   category\n 81  3SsnPorch_cat      1001 non-null   category\n 82  ScreenPorch_cat    1001 non-null   category\n 83  PoolArea_cat       1001 non-null   category\n 84  MiscVal_cat        1001 non-null   category\ndtypes: category(14), float64(3), int64(32), object(36)\nmemory usage: 578.5+ KB\n","output_type":"stream"},{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. 
Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n  warnings.warn(\n/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py:279: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. 
Consider decreasing the number of bins.\n  warnings.warn(\n","output_type":"stream"}]},{"cell_type":"code","source":"drop_list = zero_inflated_list + ['BsmtFinSF2_cat', 'LowQualFinSF_cat', 'EnclosedPorch_cat',\n                                  '3SsnPorch_cat', 'ScreenPorch_cat', 'PoolArea_cat', 'MiscVal_cat']\n\nX_train_outprep = X_train_outprep.drop(drop_list, axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-02-27T17:25:29.836058Z","iopub.execute_input":"2024-02-27T17:25:29.838101Z","iopub.status.idle":"2024-02-27T17:25:29.844932Z","shell.execute_reply.started":"2024-02-27T17:25:29.838073Z","shell.execute_reply":"2024-02-27T17:25:29.844238Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"markdown","source":"### Data Transformation- Normalization\nIn continuous features with skewed distributions but no inflation, we employ feature transformation to achieve a more normal distribution. For further learning about normalization techniques, please visit this link:  \nhttps://www.kaggle.com/code/zahrazolghadr/data-transformation","metadata":{}},
{"cell_type":"code","source":"from sklearn.preprocessing import PowerTransformer\nimport matplotlib.pyplot as plt\n\ntransform_list = ['LotFrontage','LotArea', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea',\n                  'AgeBuilt', 'AgeRemodAdd', 'AgeGarageBlt'] \n\n# Keep the fitted transformer of every feature so the lambdas learned on the training data\n# can be re-applied later (e.g. to validation/test data) with .transform()\npower_transformers = {}\n\n# Iterate through selected features\nfor feature in transform_list:\n    # Box-Cox is only defined for strictly positive data, so look for zero or negative values\n    has_non_positive_values = (X_train_outprep[feature] <= 0).any()\n\n    # Choose the appropriate transformation method\n    if has_non_positive_values:\n        transformer = PowerTransformer(method='yeo-johnson', standardize=False)\n    else:\n        transformer = PowerTransformer(method='box-cox', standardize=False)\n\n    # Fit and transform the feature, and store the result in a new column of the same DataFrame\n    X_train_outprep[f\"{feature}_transformed\"] = transformer.fit_transform(X_train_outprep[[feature]])\n    power_transformers[feature] = transformer\n\n    # Get the lambda parameter used for transformation\n    lambda_value = transformer.lambdas_[0]\n    print(f\"Lambda for {feature}: {lambda_value}\")\n    \n    # Plot histograms for original and transformed features\n    plt.figure(figsize=(6, 3))\n\n    plt.subplot(1, 2, 1)\n    plt.hist(X_train_outprep[feature], bins=30, color='blue', alpha=0.7)\n    plt.title(f'Original {feature} Histogram')\n\n    plt.subplot(1, 2, 2)\n    plt.hist(X_train_outprep[f\"{feature}_transformed\"], bins=30, color='green', alpha=0.7)\n    plt.title(f'Transformed {feature} Histogram')\n\n    plt.tight_layout()\n    plt.show()\n\n\n# Display the transformed DataFrame\nprint('\\n')\nX_train_outprep","metadata":{"trusted":true},"execution_count":38,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nIndex: 1001 entries, 708 to 132\nData columns (total 74 columns):\n #   Column                    Non-Null Count  Dtype   \n---  ------                    --------------  -----   \n 0   MSSubClass                1001 non-null   category\n 1   MSZoning                  1001 non-null   object  \n 2   LotFrontage               1001 non-null   float64 \n 3   LotArea                   1001 non-null   int64   \n 4   Alley                     1001 non-null   object  \n 5   LotShape                  1001 non-null   object  \n 6   LandContour               1001 non-null   object  \n 7   LotConfig                 1001 non-null   object  \n 8   LandSlope                 1001 non-null   object  \n 9   Neighborhood              1001 non-null   object  \n 10  Condition1                1001 non-null   object  \n 11  BldgType                  1001 non-null   object  \n 12  HouseStyle                1001 non-null   object  \n 13  OverallQual               1001 non-null   int64   \n 14  OverallCond               1001 non-null   int64   \n 15  RoofStyle                 1001 non-null   object  \n 16  Exterior1st               1001 non-null   object  \n 17  Exterior2nd               1001 
non-null   object  \n 18  MasVnrType                1001 non-null   object  \n 19  ExterQual                 1001 non-null   object  \n 20  ExterCond                 1001 non-null   object  \n 21  Foundation                1001 non-null   object  \n 22  BsmtQual                  1001 non-null   object  \n 23  BsmtCond                  1001 non-null   object  \n 24  BsmtExposure              1001 non-null   object  \n 25  BsmtFinType1              1001 non-null   object  \n 26  BsmtFinType2              1001 non-null   object  \n 27  BsmtUnfSF                 1001 non-null   int64   \n 28  TotalBsmtSF               1001 non-null   int64   \n 29  HeatingQC                 1001 non-null   object  \n 30  CentralAir                1001 non-null   object  \n 31  Electrical                1001 non-null   object  \n 32  1stFlrSF                  1001 non-null   int64   \n 33  GrLivArea                 1001 non-null   int64   \n 34  BsmtFullBath              1001 non-null   int64   \n 35  BsmtHalfBath              1001 non-null   int64   \n 36  FullBath                  1001 non-null   int64   \n 37  HalfBath                  1001 non-null   int64   \n 38  BedroomAbvGr              1001 non-null   int64   \n 39  KitchenAbvGr              1001 non-null   int64   \n 40  KitchenQual               1001 non-null   object  \n 41  TotRmsAbvGrd              1001 non-null   int64   \n 42  Functional                1001 non-null   object  \n 43  Fireplaces                1001 non-null   int64   \n 44  FireplaceQu               1001 non-null   object  \n 45  GarageType                1001 non-null   object  \n 46  GarageFinish              1001 non-null   object  \n 47  GarageCars                1001 non-null   int64   \n 48  GarageQual                1001 non-null   object  \n 49  GarageCond                1001 non-null   object  \n 50  PavedDrive                1001 non-null   object  \n 51  Fence                     1001 non-null   object  \n 52  MoSold                    1001 
non-null   int64   \n 53  SaleType                  1001 non-null   object  \n 54  SaleCondition             1001 non-null   object  \n 55  AgeBuilt                  1001 non-null   int64   \n 56  AgeRemodAdd               1001 non-null   int64   \n 57  AgeGarageBlt              1001 non-null   float64 \n 58  AgeSold                   1001 non-null   int64   \n 59  MasVnrArea_cat            1001 non-null   category\n 60  BsmtFinSF1_cat            1001 non-null   category\n 61  2ndFlrSF_cat              1001 non-null   category\n 62  GarageArea_cat            1001 non-null   category\n 63  WoodDeckSF_cat            1001 non-null   category\n 64  OpenPorchSF_cat           1001 non-null   category\n 65  LotFrontage_transformed   1001 non-null   float64 \n 66  LotArea_transformed       1001 non-null   float64 \n 67  BsmtUnfSF_transformed     1001 non-null   float64 \n 68  TotalBsmtSF_transformed   1001 non-null   float64 \n 69  1stFlrSF_transformed      1001 non-null   float64 \n 70  GrLivArea_transformed     1001 non-null   float64 \n 71  AgeBuilt_transformed      1001 non-null   float64 \n 72  AgeRemodAdd_transformed   1001 non-null   float64 \n 73  AgeGarageBlt_transformed  1001 non-null   float64 \ndtypes: category(7), float64(11), int64(20), object(36)\nmemory usage: 539.6+ KB\n","output_type":"stream"}]},
{"cell_type":"code","source":"# Drop the raw features now that their transformed counterparts exist\nX_train_outprep = X_train_outprep.drop(columns=transform_list)\n\n# Split the remaining columns by dtype: object/category versus everything else\ncategorical = list(X_train_outprep.select_dtypes(include=['object','category']).columns)\ncontinuous = list(X_train_outprep.select_dtypes(exclude=['object','category']).columns)\n\n# Show how many columns fall into each group\nlen(categorical), len(continuous)","metadata":{"execution":{"iopub.status.busy":"2024-02-27T17:33:49.186092Z","iopub.execute_input":"2024-02-27T17:33:49.186427Z","iopub.status.idle":"2024-02-27T17:33:49.199372Z","shell.execute_reply.started":"2024-02-27T17:33:49.186405Z","shell.execute_reply":"2024-02-27T17:33:49.198359Z"},"trusted":true},"execution_count":39,"outputs":[{"execution_count":39,"output_type":"execute_result","data":{"text/plain":"(43, 22)"},"metadata":{}}]}]}