{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":7273365,"sourceType":"datasetVersion","datasetId":4216596}],"dockerImageVersionId":30626,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"After identifying cases with poor quality during the **Exploratory Data Analysis (EDA)** step, the data cleaning stage addresses and rectifies these issues.  \n\n**Data cleaning** plays a fundamental role in enhancing the overall **quality** and **reliability** of datasets, directly impacting the performance of **machine learning models** and **analytical outcomes**. A well-cleaned dataset serves as the bedrock for **robust** model training and **accurate predictions**. \nIn this notebook, our primary focus is on executing the initial stage of data cleaning, including:\n* Feature screening\n* Handling values that fall outside the logical range of respective fields\n* Addressing inconsistencies within the dataset  \n\nThis meticulous approach to data cleaning not only enhances the accuracy of our analyses but also contributes significantly to the **overall success** of data science projects.","metadata":{}},{"cell_type":"markdown","source":"### Read the dataset","metadata":{}},{"cell_type":"code","source":"import pandas as pd\ndf = pd.read_csv('/kaggle/input/bank-loan/Bankloan.txt')","metadata":{"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"      age   ed  employ  address  income  debtinc   creddebt   othdebt default\n0    41.0  3.0      17       12   176.0      9.3  11.359392  5.008608       1\n1    27.0  1.0      10        6    31.0     17.3   1.362202  4.000798       0\n2    40.0  1.0      15        7     NaN      5.5   0.856075  2.168925       0\n3    41.0  NaN      15       14   120.0      2.9   2.658720  0.821280       0\n4    24.0  2.0       2        0    28.0     17.3   1.787436  3.056564       1\n..    ...  ...     ...      ...     ...      ...        ...       ...     ...\n695  36.0  2.0       6       15    27.0      4.6   0.262062  0.979938       1\n696  29.0  2.0       6        4    21.0     11.5   0.369495  2.045505       0\n697  33.0  1.0      15        3    32.0      7.6   0.491264  1.940736       0\n698  45.0  1.0      19       22    77.0      8.4   2.302608  4.165392       0\n699  37.0  1.0      12       14     NaN     14.7   2.994684  3.473316       0\n\n[700 rows x 9 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>age</th>\n      <th>ed</th>\n      <th>employ</th>\n      <th>address</th>\n      <th>income</th>\n      <th>debtinc</th>\n      <th>creddebt</th>\n      <th>othdebt</th>\n      <th>default</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>41.0</td>\n      <td>3.0</td>\n      <td>17</td>\n      <td>12</td>\n      <td>176.0</td>\n      <td>9.3</td>\n      <td>11.359392</td>\n      <td>5.008608</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>27.0</td>\n      <td>1.0</td>\n      <td>10</td>\n      <td>6</td>\n      <td>31.0</td>\n      <td>17.3</td>\n      <td>1.362202</td>\n      <td>4.000798</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>40.0</td>\n      <td>1.0</td>\n      <td>15</td>\n      <td>7</td>\n      <td>NaN</td>\n      <td>5.5</td>\n      <td>0.856075</td>\n      <td>2.168925</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>41.0</td>\n      <td>NaN</td>\n      <td>15</td>\n      <td>14</td>\n      <td>120.0</td>\n      <td>2.9</td>\n      <td>2.658720</td>\n      <td>0.821280</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>24.0</td>\n      <td>2.0</td>\n      <td>2</td>\n      <td>0</td>\n      <td>28.0</td>\n      <td>17.3</td>\n      <td>1.787436</td>\n      <td>3.056564</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>695</th>\n      <td>36.0</td>\n      <td>2.0</td>\n      <td>6</td>\n      <td>15</td>\n      <td>27.0</td>\n      <td>4.6</td>\n      <td>0.262062</td>\n      <td>0.979938</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>696</th>\n      <td>29.0</td>\n      <td>2.0</td>\n      <td>6</td>\n      <td>4</td>\n      <td>21.0</td>\n      <td>11.5</td>\n      <td>0.369495</td>\n      <td>2.045505</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>697</th>\n      <td>33.0</td>\n      <td>1.0</td>\n      <td>15</td>\n      <td>3</td>\n      <td>32.0</td>\n      <td>7.6</td>\n      <td>0.491264</td>\n      <td>1.940736</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>698</th>\n      <td>45.0</td>\n      <td>1.0</td>\n      <td>19</td>\n      <td>22</td>\n      <td>77.0</td>\n      <td>8.4</td>\n      <td>2.302608</td>\n      <td>4.165392</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>699</th>\n      <td>37.0</td>\n      <td>1.0</td>\n      <td>12</td>\n      <td>14</td>\n      <td>NaN</td>\n      <td>14.7</td>\n      <td>2.994684</td>\n      <td>3.473316</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n<p>700 rows × 9 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"To enhance clarity and facilitate streamlined data processing, we **separate the dataset** into two distinct dataframes: one designated for the **target variable** or response, and the other for the **input variables** or predictors. This segregation allows for a more organized and efficient approach in preparing the data for subsequent modeling and analysis.","metadata":{}},{"cell_type":"code","source":"target = df.iloc[:,-1]\ninputs = df.iloc[:,0:-1]","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Create two lists distinguishing **categorical** and **continuous** variables based on their respective indices.","metadata":{}},{"cell_type":"code","source":"columns = inputs.columns\n\n# Choose categorical elements \ncategorical_indices = [1]\n\n# Use a list comprehension to select the elements at the specified indices\ncategorical_fields = [columns[i] for i in categorical_indices]\n\n# Create a new list of columns excluding categorical_fields (continuous)\ncontinuous_fields = [j for j in columns if j not in categorical_fields]","metadata":{"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"['age', 'employ', 'address', 'income', 'debtinc', 'creddebt', 'othdebt']"},"metadata":{}}]},{"cell_type":"markdown","source":"### Feature Screening \nFilter out these features:  \n\n* **Features with a coefficient of variation less than 0.1 for continuous variables**  \nIdentifying and screening out **continuous variables** with low variability ensures that the selected features provide **meaningful information** for analysis and modeling.\n\n* **Features where the mode category percentage is greater than 95% for categorical variables**  \nThis step focuses on retaining **categorical variables** where one category overwhelmingly dominates, helping to streamline the dataset and enhance the interpretability of the resulting models.\n\n* **Features with a percentage of unique categories exceeding 90% for categorical variables**  \nScreening out **categorical variables** with a high percentage of unique categories contributes to simplifying the dataset and mitigating the risk of overfitting, ensuring a more robust and generalizable model.","metadata":{}},{"cell_type":"code","source":"# Define a minimum value for coefficient of variation\nmin_cv = 0.1\n\n# Calculate the coefficient of variation for each column\ncv_values = inputs[continuous_fields].std() / inputs[continuous_fields].mean()\n\n# Filter out columns with CV less than 0.1\nselected_columns =  cv_values[cv_values < 0.1].index\n\n# Create a new DataFrame with only the selected columns\nfiltered_con = inputs[selected_columns]\n\n# Print the resulting DataFrame\ninputs_con = inputs[continuous_fields].drop(selected_columns, axis=1)\nprint(inputs_con)","metadata":{"trusted":true},"execution_count":23,"outputs":[{"execution_count":23,"output_type":"execute_result","data":{"text/plain":"      age  employ  address  income  debtinc   creddebt   othdebt\n0    41.0      17       12   176.0      9.3  11.359392  5.008608\n1    27.0      10        6    31.0     17.3   1.362202  4.000798\n2    40.0      15        7     NaN      5.5   0.856075  2.168925\n3    41.0      15       14   120.0      2.9   2.658720  0.821280\n4    24.0       2        0    28.0     17.3   1.787436  3.056564\n..    ...     ...      ...     ...      ...        ...       ...\n695  36.0       6       15    27.0      4.6   0.262062  0.979938\n696  29.0       6        4    21.0     11.5   0.369495  2.045505\n697  33.0      15        3    32.0      7.6   0.491264  1.940736\n698  45.0      19       22    77.0      8.4   2.302608  4.165392\n699  37.0      12       14     NaN     14.7   2.994684  3.473316\n\n[700 rows x 7 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>age</th>\n      <th>employ</th>\n      <th>address</th>\n      <th>income</th>\n      <th>debtinc</th>\n      <th>creddebt</th>\n      <th>othdebt</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>41.0</td>\n      <td>17</td>\n      <td>12</td>\n      <td>176.0</td>\n      <td>9.3</td>\n      <td>11.359392</td>\n      <td>5.008608</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>27.0</td>\n      <td>10</td>\n      <td>6</td>\n      <td>31.0</td>\n      <td>17.3</td>\n      <td>1.362202</td>\n      <td>4.000798</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>40.0</td>\n      <td>15</td>\n      <td>7</td>\n      <td>NaN</td>\n      <td>5.5</td>\n      <td>0.856075</td>\n      <td>2.168925</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>41.0</td>\n      <td>15</td>\n      <td>14</td>\n      <td>120.0</td>\n      <td>2.9</td>\n      <td>2.658720</td>\n      <td>0.821280</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>24.0</td>\n      <td>2</td>\n      <td>0</td>\n      <td>28.0</td>\n      <td>17.3</td>\n      <td>1.787436</td>\n      <td>3.056564</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>695</th>\n      <td>36.0</td>\n      <td>6</td>\n      <td>15</td>\n      <td>27.0</td>\n      <td>4.6</td>\n      <td>0.262062</td>\n      <td>0.979938</td>\n    </tr>\n    <tr>\n      <th>696</th>\n      <td>29.0</td>\n      <td>6</td>\n      <td>4</td>\n      <td>21.0</td>\n      <td>11.5</td>\n      <td>0.369495</td>\n      <td>2.045505</td>\n    </tr>\n    <tr>\n      <th>697</th>\n      <td>33.0</td>\n      <td>15</td>\n      <td>3</td>\n      <td>32.0</td>\n      <td>7.6</td>\n      <td>0.491264</td>\n      <td>1.940736</td>\n    </tr>\n    <tr>\n      <th>698</th>\n      <td>45.0</td>\n      <td>19</td>\n      <td>22</td>\n      <td>77.0</td>\n      <td>8.4</td>\n      <td>2.302608</td>\n      <td>4.165392</td>\n    </tr>\n    <tr>\n      <th>699</th>\n      <td>37.0</td>\n      <td>12</td>\n      <td>14</td>\n      <td>NaN</td>\n      <td>14.7</td>\n      <td>2.994684</td>\n      <td>3.473316</td>\n    </tr>\n  </tbody>\n</table>\n<p>700 rows × 7 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"import pandas as pd\n\n# Define a threshold for the dominant category percentage\nthreshold = 95\n\n# Calculate the percentage of the mode category for each column\nmode_category = (inputs[categorical_fields].apply(lambda x: x.value_counts().max() / len(x)) * 100)\n\n# Select columns where the mode category percentage is greater than the threshold\nselected_categorical_columns = mode_category[mode_category > threshold].index\n\n# Create a new DataFrame with only the selected columns\nmode_filtered_inputs = inputs[selected_categorical_columns]\n\n# Filter out selected columns and print the resulting DataFrame\ninputs_cat = inputs[categorical_fields].drop(selected_categorical_columns, axis=1)\nprint(inputs_cat)","metadata":{"trusted":true},"execution_count":34,"outputs":[{"execution_count":34,"output_type":"execute_result","data":{"text/plain":"      ed\n0    3.0\n1    1.0\n2    1.0\n3    NaN\n4    2.0\n..   ...\n695  2.0\n696  2.0\n697  1.0\n698  1.0\n699  1.0\n\n[700 rows x 1 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ed</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>695</th>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>696</th>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>697</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>698</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>699</th>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>700 rows × 1 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"import pandas as pd\n\n# Set a threshold for excluding columns \nthreshold = 90\n\n# Calculate the percentage of distinct categories in categorical variables\ndistinct_percentage = (inputs_cat[categorical_fields].apply(lambda x: x.dropna().nunique() / x.count()) * 100)\n\n# Select categorical columns based on distinct percentage threshold\nselected_categorical_columns = distinct_percentage[distinct_percentage > threshold].index\n\n# Create a new DataFrame with only the selected columns\ndistinct_filtered_inputs = inputs_cat[selected_categorical_columns]\n\n# Filter out selected columns and print the resulting DataFrame\ninputs_cat = inputs_cat.drop(selected_categorical_columns, axis=1)\nprint(inputs_cat)\n","metadata":{"trusted":true},"execution_count":43,"outputs":[{"execution_count":43,"output_type":"execute_result","data":{"text/plain":"      ed\n0    3.0\n1    1.0\n2    1.0\n3    NaN\n4    2.0\n..   ...\n695  2.0\n696  2.0\n697  1.0\n698  1.0\n699  1.0\n\n[700 rows x 1 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ed</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>695</th>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>696</th>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>697</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>698</th>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>699</th>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>700 rows × 1 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"Create a new dataframe by excluding both continuous and categorical features through feature screening.","metadata":{}},{"cell_type":"code","source":"filtered_df = pd.concat([inputs_con, inputs_cat, target], axis=1)","metadata":{"trusted":true},"execution_count":45,"outputs":[{"execution_count":45,"output_type":"execute_result","data":{"text/plain":"      age  employ  address  income  debtinc   creddebt   othdebt   ed default\n0    41.0      17       12   176.0      9.3  11.359392  5.008608  3.0       1\n1    27.0      10        6    31.0     17.3   1.362202  4.000798  1.0       0\n2    40.0      15        7     NaN      5.5   0.856075  2.168925  1.0       0\n3    41.0      15       14   120.0      2.9   2.658720  0.821280  NaN       0\n4    24.0       2        0    28.0     17.3   1.787436  3.056564  2.0       1\n..    ...     ...      ...     ...      ...        ...       ...  ...     ...\n695  36.0       6       15    27.0      4.6   0.262062  0.979938  2.0       1\n696  29.0       6        4    21.0     11.5   0.369495  2.045505  2.0       0\n697  33.0      15        3    32.0      7.6   0.491264  1.940736  1.0       0\n698  45.0      19       22    77.0      8.4   2.302608  4.165392  1.0       0\n699  37.0      12       14     NaN     14.7   2.994684  3.473316  1.0       0\n\n[700 rows x 9 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>age</th>\n      <th>employ</th>\n      <th>address</th>\n      <th>income</th>\n      <th>debtinc</th>\n      <th>creddebt</th>\n      <th>othdebt</th>\n      <th>ed</th>\n      <th>default</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>41.0</td>\n      <td>17</td>\n      <td>12</td>\n      <td>176.0</td>\n      <td>9.3</td>\n      <td>11.359392</td>\n      <td>5.008608</td>\n      <td>3.0</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>27.0</td>\n      <td>10</td>\n      <td>6</td>\n      <td>31.0</td>\n      <td>17.3</td>\n      <td>1.362202</td>\n      <td>4.000798</td>\n      <td>1.0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>40.0</td>\n      <td>15</td>\n      <td>7</td>\n      <td>NaN</td>\n      <td>5.5</td>\n      <td>0.856075</td>\n      <td>2.168925</td>\n      <td>1.0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>41.0</td>\n      <td>15</td>\n      <td>14</td>\n      <td>120.0</td>\n      <td>2.9</td>\n      <td>2.658720</td>\n      <td>0.821280</td>\n      <td>NaN</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>24.0</td>\n      <td>2</td>\n      <td>0</td>\n      <td>28.0</td>\n      <td>17.3</td>\n      <td>1.787436</td>\n      <td>3.056564</td>\n      <td>2.0</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>695</th>\n      <td>36.0</td>\n      <td>6</td>\n      <td>15</td>\n      <td>27.0</td>\n      <td>4.6</td>\n      <td>0.262062</td>\n      <td>0.979938</td>\n      <td>2.0</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>696</th>\n      <td>29.0</td>\n      <td>6</td>\n      <td>4</td>\n      <td>21.0</td>\n      <td>11.5</td>\n      <td>0.369495</td>\n      <td>2.045505</td>\n      <td>2.0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>697</th>\n      <td>33.0</td>\n      <td>15</td>\n      <td>3</td>\n      <td>32.0</td>\n      <td>7.6</td>\n      <td>0.491264</td>\n      <td>1.940736</td>\n      <td>1.0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>698</th>\n      <td>45.0</td>\n      <td>19</td>\n      <td>22</td>\n      <td>77.0</td>\n      <td>8.4</td>\n      <td>2.302608</td>\n      <td>4.165392</td>\n      <td>1.0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>699</th>\n      <td>37.0</td>\n      <td>12</td>\n      <td>14</td>\n      <td>NaN</td>\n      <td>14.7</td>\n      <td>2.994684</td>\n      <td>3.473316</td>\n      <td>1.0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n<p>700 rows × 9 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"###  Handling Values Outside the Logical Range  \nIn data analysis, handling values that fall outside the logical range of respective fields is a critical step to maintain the **integrity** and **reliability** of the dataset. Values significantly deviating from the expected range, can distort analytical results and impact the overall **quality of findings**.Whether in continuous fields, addressing values beyond the logical range ensures that subsequent modeling or statistical techniques are based on a more representative and **trustworthy** dataset.","metadata":{}},{"cell_type":"code","source":"import pandas as pd\n\n# Define ranges for each column\ncolumn_ranges = {\n    'age': (18, 70),\n    'employ': (0, 31),\n    'address': (0, 80),\n    'income': (0, 1000),\n    'debtinc': (0, 100),\n    'creddebt': (0, 30),\n    'othdebt': (0, 30)\n}\n\n# Iterate through each column and fill NaN values outside the defined range\nfor column, (min_val, max_val) in column_ranges.items():\n    filtered_df[column] = filtered_df[column].apply(lambda x: x if min_val <= x <= max_val else None)\n\n# Display the updated DataFrame\nprint(filtered_df)\nfiltered_df.describe()\nfiltered_df.info()","metadata":{"trusted":true},"execution_count":51,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 700 entries, 0 to 699\nData columns (total 9 columns):\n #   Column    Non-Null Count  Dtype  \n---  ------    --------------  -----  \n 0   age       680 non-null    float64\n 1   employ    700 non-null    int64  \n 2   address   700 non-null    int64  \n 3   income    663 non-null    float64\n 4   debtinc   700 non-null    float64\n 5   creddebt  700 non-null    float64\n 6   othdebt   700 non-null    float64\n 7   ed        680 non-null    float64\n 8   default   700 non-null    object \ndtypes: float64(6), int64(2), object(1)\nmemory usage: 49.3+ KB\n","output_type":"stream"}]},{"cell_type":"markdown","source":"### Handling Inconsistent Data  \nIn the area of data analysis, addressing inconsistent data is a basic task to ensure the **reliability** of results. Inconsistent data in **categorical variables**, whether due to **data entry errors** or discrepancies in **data Integration**, can introduce noise and **inaccuracies** into the dataset, potentially leading to misleading findings. For instance, one employee may enter customer addresses as \"block 1/23\", while another may use \"block 1-23\".  \nBy handling and rectifying these inconsistencies, analysts can foster a more cohesive and accurate representation of the underlying information. The impact of such attention to detail extends beyond cleaning the dataset; it directly influences the **credibility** of analysis reports. A meticulously curated dataset, free from inconsistencies in codes, lays the groundwork for **robust** statistical analyses and more informed decision-making.","metadata":{}},{"cell_type":"markdown","source":"First detect inconsistent data in frequency tables.","metadata":{}},{"cell_type":"code","source":"import numpy as np\n\ndef frequency_table(variable):\n    \n    # Get unique elements and their counts\n    unique_elements, counts = np.unique(variable.dropna(), return_counts=True)\n\n    # Calculate percentages\n    percentages = (counts / len(variable)) * 100\n\n    # Create a dictionary to store the value counts and percentages\n    value_counts_and_percentages = zip(unique_elements, counts, percentages)\n\n    # Print the value counts and percentages\n    for i, j, k in value_counts_and_percentages:\n        print(f\"{i}: Count: {j}, Percentage: {k:.2f}%\")\n    return\n\n\nfrequency_table(filtered_df['default'])","metadata":{"execution":{"iopub.status.busy":"2024-01-16T13:19:57.027793Z","iopub.execute_input":"2024-01-16T13:19:57.028109Z","iopub.status.idle":"2024-01-16T13:19:57.036233Z","shell.execute_reply.started":"2024-01-16T13:19:57.028084Z","shell.execute_reply":"2024-01-16T13:19:57.034902Z"},"trusted":true},"execution_count":52,"outputs":[{"name":"stdout","text":"'0': Count: 1, Percentage: 0.14%\n0: Count: 515, Percentage: 73.57%\n1: Count: 183, Percentage: 26.14%\n:0: Count: 1, Percentage: 0.14%\n","output_type":"stream"}]},{"cell_type":"markdown","source":"After detecting inconsistent data in the frequency tables, the logical next step is to replace the incorrect data with the correct ones.","metadata":{}},{"cell_type":"code","source":"filtered_df['default'] = filtered_df['default'].replace([':0', \"'0'\"], '0')","metadata":{"execution":{"iopub.status.busy":"2024-01-16T13:21:22.259172Z","iopub.execute_input":"2024-01-16T13:21:22.259503Z","iopub.status.idle":"2024-01-16T13:21:22.265865Z","shell.execute_reply.started":"2024-01-16T13:21:22.259469Z","shell.execute_reply":"2024-01-16T13:21:22.264681Z"},"trusted":true},"execution_count":53,"outputs":[]},{"cell_type":"markdown","source":"Check the frequency table to ensure data consistency.","metadata":{}},{"cell_type":"code","source":"frequency_table(filtered_df['default'])","metadata":{"execution":{"iopub.status.busy":"2024-01-16T13:21:28.883736Z","iopub.execute_input":"2024-01-16T13:21:28.884060Z","iopub.status.idle":"2024-01-16T13:21:28.890124Z","shell.execute_reply.started":"2024-01-16T13:21:28.884035Z","shell.execute_reply":"2024-01-16T13:21:28.889172Z"},"trusted":true},"execution_count":54,"outputs":[{"name":"stdout","text":"0: Count: 517, Percentage: 73.86%\n1: Count: 183, Percentage: 26.14%\n","output_type":"stream"}]},{"cell_type":"code","source":"filtered_df.to_csv('/kaggle/working/Bankloan_Cleanedv1.csv')","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}