diff --git a/.gitignore b/.gitignore
index a9783e2..7f3c84d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
concrete/*
+data/*
diff --git a/README.md b/README.md
index 1bee5de..bbf76f6 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,11 @@ pip install src/requirements.txt
```
3. Execute the notebooks in the specified order, ensuring that the dataset and necessary files are correctly referenced.
+
+## Dask as the Runtime Engine
+
+In the data processing step, we have incorporated **Dask**, a parallel computing library, to handle large-scale data efficiently. With Dask, we can process data in a distributed manner, allowing us to scale our computation to multiple cores and machines seamlessly. The use of Dask enables us to leverage the power of parallel processing, making our data processing pipelines faster and more scalable.
+
Feel free to experiment with different regression algorithms and hyperparameter tuning to further enhance the model performance. Share your feedback and contribute to this project to help us improve and expand its capabilities.
## MLflow Integration
@@ -71,4 +76,4 @@ For inquiries or further information, please contact me at:
- LinkedIn: https://www.linkedin.com/in/pedro-a-d-s/
## License
-This project is licensed under the MIT License.
\ No newline at end of file
+This project is licensed under the MIT License.
diff --git a/data/1-bronze/Concrete_Data_Cleaned.parquet b/data/1-bronze/Concrete_Data_Cleaned.parquet
index e50a3d7..b90636f 100644
Binary files a/data/1-bronze/Concrete_Data_Cleaned.parquet and b/data/1-bronze/Concrete_Data_Cleaned.parquet differ
diff --git a/notebooks/01-EDA/01_EDA_strength_prediction.ipynb b/notebooks/01-EDA/01_EDA_strength_prediction.ipynb
index 2cd7176..4606aed 100644
--- a/notebooks/01-EDA/01_EDA_strength_prediction.ipynb
+++ b/notebooks/01-EDA/01_EDA_strength_prediction.ipynb
@@ -81,11 +81,9 @@
"metadata": {},
"outputs": [],
"source": [
- "import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
- "import itertools\n",
"from scipy import stats\n",
"%matplotlib inline\n",
"sns.set_style(\"white\")"
@@ -184,7 +182,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df = pd.read_csv('../../data/0. raw-data/Concrete_Data.csv')"
+ "df = pd.read_csv('../../data/0-raw-data/Concrete_Data.csv')"
]
},
{
@@ -2567,13 +2565,6 @@
"source": [
"df.to_parquet('../../data/2-silver/Concrete_Data_Cleaned.parquet', index = False)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/notebooks/02-ML Modelling/01_ml_experiments.ipynb b/notebooks/02-ML Modelling/01_ml_experiments.ipynb
index d77df77..9807c91 100644
--- a/notebooks/02-ML Modelling/01_ml_experiments.ipynb
+++ b/notebooks/02-ML Modelling/01_ml_experiments.ipynb
@@ -32,22 +32,16 @@
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
- "import xgboost as xgb\n",
"import mlflow\n",
"import mlflow.sklearn\n",
- "%matplotlib inline\n",
- "sns.set_style(\"white\")\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor\n",
"from sklearn.svm import SVR\n",
"from sklearn import metrics\n",
- "from scipy import stats\n",
"from scipy.stats import zscore\n",
- "from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score\n",
"from sklearn.model_selection import KFold\n",
"from sklearn.model_selection import cross_val_score\n",
"from xgboost.sklearn import XGBRegressor\n",
@@ -64,7 +58,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -73,13 +67,15 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {
"id": "tp7uDcZaMqku"
},
"outputs": [],
"source": [
- "warnings.filterwarnings(\"ignore\")"
+ "warnings.filterwarnings(\"ignore\")\n",
+ "%matplotlib inline\n",
+ "sns.set_style(\"white\")"
]
},
{
@@ -93,13 +89,13 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 7,
"metadata": {
"id": "eBiHe1RdMqku"
},
"outputs": [],
"source": [
- "df = pd.read_parquet('../../data/2-silver/Concrete_Data_Cleaned.parquet')"
+ "df = pd.read_parquet('./data/1-bronze/Concrete_Data_Cleaned.parquet')"
]
},
{
@@ -113,7 +109,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -165,8 +161,8 @@
"
2.5 | \n",
" 1040.0 | \n",
" 676.0 | \n",
- " 28 | \n",
- " 79.99 | \n",
+ " 28.0 | \n",
+ " 79,99 | \n",
" \n",
" \n",
" 1 | \n",
@@ -177,8 +173,8 @@
" 2.5 | \n",
" 1055.0 | \n",
" 676.0 | \n",
- " 28 | \n",
- " 61.89 | \n",
+ " 28.0 | \n",
+ " 61,89 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -189,8 +185,8 @@
" 0.0 | \n",
" 932.0 | \n",
" 594.0 | \n",
- " 28 | \n",
- " 40.27 | \n",
+ " 28.0 | \n",
+ " 40,27 | \n",
"
\n",
" \n",
" 3 | \n",
@@ -201,8 +197,8 @@
" 0.0 | \n",
" 932.0 | \n",
" 594.0 | \n",
- " 28 | \n",
- " 41.05 | \n",
+ " 28.0 | \n",
+ " 41,05 | \n",
"
\n",
" \n",
" 4 | \n",
@@ -213,23 +209,23 @@
" 0.0 | \n",
" 978.4 | \n",
" 825.5 | \n",
- " 28 | \n",
- " 44.30 | \n",
+ " 28.0 | \n",
+ " 44,30 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " cement slag ash water superplastic coarseagg fineagg age strength\n",
- "0 272.9 0.0 0.0 162.0 2.5 1040.0 676.0 28 79.99\n",
- "1 272.9 0.0 0.0 162.0 2.5 1055.0 676.0 28 61.89\n",
- "2 332.5 142.5 0.0 185.0 0.0 932.0 594.0 28 40.27\n",
- "3 332.5 142.5 0.0 185.0 0.0 932.0 594.0 28 41.05\n",
- "4 198.6 132.4 0.0 192.0 0.0 978.4 825.5 28 44.30"
+ " cement slag ash water superplastic coarseagg fineagg age strength\n",
+ "0 272.9 0.0 0.0 162.0 2.5 1040.0 676.0 28.0 79,99 \n",
+ "1 272.9 0.0 0.0 162.0 2.5 1055.0 676.0 28.0 61,89 \n",
+ "2 332.5 142.5 0.0 185.0 0.0 932.0 594.0 28.0 40,27 \n",
+ "3 332.5 142.5 0.0 185.0 0.0 932.0 594.0 28.0 41,05 \n",
+ "4 198.6 132.4 0.0 192.0 0.0 978.4 825.5 28.0 44,30 "
]
},
- "execution_count": 5,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -262,7 +258,7 @@
"data": {
"text/plain": [
"Index(['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg',\n",
- " 'fineagg', 'age', 'strength'],\n",
+ " 'fineagg', 'age', 'Concrete compressive strength(MPa, megapascals) '],\n",
" dtype='object')"
]
},
@@ -277,7 +273,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {
"id": "7rkJekAZMqkv"
},
@@ -292,7 +288,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -311,7 +307,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {
"id": "WFW8u_C8Mqkw"
},
@@ -323,7 +319,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {
"id": "MmXTIBBdMqkw"
},
@@ -334,7 +330,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -343,200 +339,7 @@
"id": "Y0xZGBALMqkw",
"outputId": "f6f38de1-59b9-439d-e499-dedcf46041ec"
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cement | \n",
- " slag | \n",
- " ash | \n",
- " water | \n",
- " superplastic | \n",
- " coarseagg | \n",
- " fineagg | \n",
- " age | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0.147951 | \n",
- " -0.830600 | \n",
- " -0.822730 | \n",
- " -1.004275 | \n",
- " -0.618977 | \n",
- " 1.262346 | \n",
- " -1.282281 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0.147951 | \n",
- " -0.830600 | \n",
- " -0.822730 | \n",
- " -1.004275 | \n",
- " -0.618977 | \n",
- " 1.496911 | \n",
- " -1.282281 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0.892690 | \n",
- " 1.130849 | \n",
- " -0.822730 | \n",
- " 0.396167 | \n",
- " -1.156464 | \n",
- " -0.426523 | \n",
- " -2.543096 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0.892690 | \n",
- " 1.130849 | \n",
- " -0.822730 | \n",
- " 0.396167 | \n",
- " -1.156464 | \n",
- " -0.426523 | \n",
- " -2.543096 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " -0.780475 | \n",
- " 0.991827 | \n",
- " -0.822730 | \n",
- " 0.822389 | \n",
- " -1.156464 | \n",
- " 0.299065 | \n",
- " 1.016402 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1025 | \n",
- " 0.191685 | \n",
- " 0.766088 | \n",
- " 0.642499 | \n",
- " 0.067368 | \n",
- " 0.756991 | \n",
- " -1.394495 | \n",
- " 0.136906 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 1026 | \n",
- " 0.763985 | \n",
- " -0.830600 | \n",
- " 1.053023 | \n",
- " 1.065944 | \n",
- " 1.079483 | \n",
- " -2.210781 | \n",
- " 0.830354 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 1027 | \n",
- " -1.406506 | \n",
- " 1.088179 | \n",
- " 0.939440 | \n",
- " 0.865011 | \n",
- " 0.155005 | \n",
- " -1.045775 | \n",
- " 0.316803 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 1028 | \n",
- " -1.274052 | \n",
- " 1.739242 | \n",
- " -0.822730 | \n",
- " -0.176187 | \n",
- " 1.272979 | \n",
- " 0.474207 | \n",
- " 0.453647 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- " 1029 | \n",
- " -0.001997 | \n",
- " 0.552738 | \n",
- " 0.447784 | \n",
- " 1.346033 | \n",
- " 0.692492 | \n",
- " -1.482066 | \n",
- " 0.032350 | \n",
- " 0.300203 | \n",
- "
\n",
- " \n",
- "
\n",
- "
1030 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " cement slag ash water superplastic coarseagg \\\n",
- "0 0.147951 -0.830600 -0.822730 -1.004275 -0.618977 1.262346 \n",
- "1 0.147951 -0.830600 -0.822730 -1.004275 -0.618977 1.496911 \n",
- "2 0.892690 1.130849 -0.822730 0.396167 -1.156464 -0.426523 \n",
- "3 0.892690 1.130849 -0.822730 0.396167 -1.156464 -0.426523 \n",
- "4 -0.780475 0.991827 -0.822730 0.822389 -1.156464 0.299065 \n",
- "... ... ... ... ... ... ... \n",
- "1025 0.191685 0.766088 0.642499 0.067368 0.756991 -1.394495 \n",
- "1026 0.763985 -0.830600 1.053023 1.065944 1.079483 -2.210781 \n",
- "1027 -1.406506 1.088179 0.939440 0.865011 0.155005 -1.045775 \n",
- "1028 -1.274052 1.739242 -0.822730 -0.176187 1.272979 0.474207 \n",
- "1029 -0.001997 0.552738 0.447784 1.346033 0.692492 -1.482066 \n",
- "\n",
- " fineagg age \n",
- "0 -1.282281 0.300203 \n",
- "1 -1.282281 0.300203 \n",
- "2 -2.543096 0.300203 \n",
- "3 -2.543096 0.300203 \n",
- "4 1.016402 0.300203 \n",
- "... ... ... \n",
- "1025 0.136906 0.300203 \n",
- "1026 0.830354 0.300203 \n",
- "1027 0.316803 0.300203 \n",
- "1028 0.453647 0.300203 \n",
- "1029 0.032350 0.300203 \n",
- "\n",
- "[1030 rows x 8 columns]"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"X_scaled_df"
]
@@ -552,7 +355,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"metadata": {
"id": "gObLM3QlMqkw"
},
@@ -586,7 +389,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"metadata": {
"id": "XQ2br8k9NHfu"
},
@@ -609,7 +412,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
"metadata": {
"id": "c2W5AhKUOCgy"
},
@@ -649,7 +452,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": null,
"metadata": {
"id": "CMAv3vTGOn6y"
},
@@ -660,7 +463,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -668,29 +471,7 @@
"id": "O4E4RHz_Oe2_",
"outputId": "4c509277-cc89-4f86-93d9-fb902715f2eb"
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2023/07/16 18:02:12 INFO mlflow.tracking.fluent: Experiment with name 'Concrete-Strength-Experiments' does not exist. Creating a new experiment.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 96.509%\n",
- "R2 Score: 0.847\n",
- "------------------------------\n",
- "MSE: 40.217\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.865300222145633\n",
- "Model run: 8977191a0af847ce892e666056fdfe66\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"mlflow.set_experiment('Concrete-Strength-Experiments')\n",
"\n",
@@ -750,7 +531,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": null,
"metadata": {
"id": "-qM94fgVVAPg"
},
@@ -762,7 +543,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -770,22 +551,7 @@
"id": "S9TPTjfyU8Hu",
"outputId": "af055a69-0df1-49a2-e8b4-600058e912ca"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 91.291%\n",
- "R2 Score: 0.830\n",
- "------------------------------\n",
- "MSE: 44.584\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.8554210267441394\n",
- "Model run: 9b1561393cb74b33b5b8e7ba5194e815\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Start a new MLflow run\n",
"with mlflow.start_run():\n",
@@ -833,7 +599,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": null,
"metadata": {
"id": "yHKzg4qWWb-m"
},
@@ -845,7 +611,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -853,22 +619,7 @@
"id": "kCmiu1XkWQav",
"outputId": "0f721df4-1dfd-4b9e-8073-5dcaf365243a"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 76.634%\n",
- "R2 Score: 0.711\n",
- "------------------------------\n",
- "MSE: 75.675\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.7301976290471304\n",
- "Model run: 5fcdcf5c1f2a4c5e9ba717b187a02734\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Start a new MLflow run\n",
"with mlflow.start_run():\n",
@@ -917,7 +668,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": null,
"metadata": {
"id": "-QieEmgDMqlA"
},
@@ -944,7 +695,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -953,18 +704,7 @@
"id": "PHt_x_hDMqlA",
"outputId": "8588acb4-48fe-4705-e1b8-255886f83416"
},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"plt.figure(figsize = (12, 6))\n",
"plt.plot(range(1, 45), diff_k, color = 'blue', linestyle = 'dashed', marker = 'o', markerfacecolor = 'red', markersize = 10)\n",
@@ -975,7 +715,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": null,
"metadata": {
"id": "7ZQrM1JrXwI0"
},
@@ -987,7 +727,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -995,22 +735,7 @@
"id": "KZiZDhvmXw7I",
"outputId": "a1c484e9-7e73-40fb-b0c8-3482c8162584"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 89.268%\n",
- "R2 Score: 0.733\n",
- "------------------------------\n",
- "MSE: 70.035\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.5648537541937363\n",
- "Model run: 355930aab0bb4150b78f6287ea78ab42\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Start a new MLflow run\n",
"with mlflow.start_run():\n",
@@ -1059,7 +784,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": null,
"metadata": {
"id": "cHj1u0PaYsdU"
},
@@ -1071,7 +796,7 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1079,22 +804,7 @@
"id": "-4Egu-dNY-s1",
"outputId": "309d1f06-8e22-4bbb-bbae-5b998acad056"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 95.763%\n",
- "R2 Score: 0.812\n",
- "------------------------------\n",
- "MSE: 49.370\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.717\n",
- "Model run: 9932d0ac56c242b58b2de46960bab005\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Start a new MLflow run\n",
"with mlflow.start_run():\n",
@@ -1142,7 +852,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": null,
"metadata": {
"id": "xMj6UVCwZfU0"
},
@@ -1154,7 +864,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1162,22 +872,7 @@
"id": "6FYMCyx1Zhwd",
"outputId": "e8eb3752-6e35-4de4-ea95-dc1bdfed32f1"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 66.721%\n",
- "R2 Score: 0.569\n",
- "------------------------------\n",
- "MSE: 112.921\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.728\n",
- "Model run: 663fc1927b364acda888e43c638153f1\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Start a new MLflow run\n",
"with mlflow.start_run():\n",
@@ -1229,7 +924,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": null,
"metadata": {
"id": "LbUPeJwjajYO"
},
@@ -1241,7 +936,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1249,22 +944,7 @@
"id": "nZTW4B5iauJz",
"outputId": "a6215850-5994-49cf-c0cd-453244647495"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 97.753%\n",
- "R2 Score: 0.832\n",
- "------------------------------\n",
- "MSE: 44.158\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.869\n",
- "Model run: 2f650498d25a482cb9f66c9f9bee6b96\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Start a new MLflow run\n",
"with mlflow.start_run():\n",
@@ -1310,7 +990,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": null,
"metadata": {
"id": "OSrT6YBwbBi6"
},
@@ -1322,7 +1002,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1330,22 +1010,7 @@
"id": "f8MwYZ3ubNt-",
"outputId": "8a7b7201-f126-46bd-b80d-4041a69d5a33"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train Accuracy: 97.812%\n",
- "R2 Score: 0.765\n",
- "------------------------------\n",
- "MSE: 61.654\n",
- "------------------------------\n",
- "Metrics and artifacts logged!\n",
- "cv accuracy: 0.7909050881453388\n",
- "Model run: 17ab311e3f534c12a688242d96080e03\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Start a new MLflow run\n",
"with mlflow.start_run():\n",
@@ -1401,21 +1066,9 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "image/jpeg": "",
- "text/plain": [
- ""
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"Image(filename = './images/MLFlow-image-1.jpeg')"
]
@@ -1429,19 +1082,9 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Successfully registered model 'XGBoost Model'.\n",
- "2023/07/16 18:22:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost Model, version 1\n",
- "Created version '1' of model 'XGBoost Model'.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"xgboost_run_id = '2f650498d25a482cb9f66c9f9bee6b96'\n",
"\n",
@@ -1451,17 +1094,9 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2023/07/16 18:35:44 WARNING mlflow.sklearn: Model was missing function: predict. Not logging python_function flavor!\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"mlflow.sklearn.save_model(model_details, 'model')"
]
diff --git a/src/model_experiments/SVR.py b/src/model_experiments/SVR.py
new file mode 100644
index 0000000..cdd9335
--- /dev/null
+++ b/src/model_experiments/SVR.py
@@ -0,0 +1,63 @@
+import os
+import numpy as np
+import pandas as pd
+import argparse
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.svm import SVR
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# Move the working directory up one level; the relative data paths below resolve against it
+os.chdir('../')
+
+# Get Data: X / y feed the cross-validation below, X_train / y_train feed evaluator.train
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+# Parse command-line arguments (--kernel is forwarded unchanged to the SVR constructor)
+parser = argparse.ArgumentParser()
+parser.add_argument('--kernel', type = str, default = 'linear')
+args = parser.parse_args()
+
+# Instances (the estimator is named svr_model so the imported SVR class is not shadowed)
+svr_model = SVR(kernel = args.kernel)
+evaluator = ModelEvaluator()
+
+# Experiment: everything below is recorded in a single MLflow run
+with start_run():
+    run_name = 'SVR'
+    set_tag('mlflow.runName', run_name)
+
+    # Train the model -- NOTE(review): evaluator is built without svr_model; confirm train() fits it
+    evaluator.train(X_train, y_train)
+
+    # Log hyperparameters
+    log_param('kernel', args.kernel)
+
+    # Perform cross-validation (20 shuffled folds; cross_val_score fits clones, not svr_model itself)
+    k = 20
+    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+    K_results = cross_val_score(svr_model, X, y, cv = kfold)
+    accuracy = np.mean(abs(K_results))
+
+    # Log cross-validation-metrics (mean of the absolute per-fold scores)
+    log_metric('cv_accuracy', accuracy)
+    info('cv accuracy loaded successfully.')
+
+    # Log the model
+    log_model(svr_model, 'SVR')
+
+    # Print the run UUID
+    print('Model run: ', active_run().info.run_uuid)
+
+# End run (redundant safeguard: leaving the with-block above already ends the run)
+end_run()
\ No newline at end of file
diff --git a/src/model_experiments/__init__.py b/src/model_experiments/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/model_experiments/adaboost.py b/src/model_experiments/adaboost.py
new file mode 100644
index 0000000..495ea93
--- /dev/null
+++ b/src/model_experiments/adaboost.py
@@ -0,0 +1,64 @@
+import os
+import numpy as np
+import pandas as pd
+import argparse
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.ensemble import AdaBoostRegressor
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# Move the working directory up one level; the relative data paths below resolve against it
+os.chdir('../')
+
+# Get Data: X / y feed the cross-validation below, X_train / y_train feed evaluator.train
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+# Parse command-line arguments (--n_estimators is forwarded unchanged to AdaBoostRegressor)
+parser = argparse.ArgumentParser()
+parser.add_argument('--n_estimators', type = int, default = 50)
+args = parser.parse_args()
+
+# Instances
+ada_boost = AdaBoostRegressor(n_estimators = args.n_estimators)
+evaluator = ModelEvaluator()
+
+# Experiment: everything below is recorded in a single MLflow run
+with start_run():
+    # Set a custom run name
+    run_name = 'Ada Boost'
+    set_tag('mlflow.runName', run_name)
+
+    # Train the model -- NOTE(review): evaluator is built without ada_boost; confirm train() fits it
+    evaluator.train(X_train, y_train)
+
+    # Log hyperparameter
+    log_param('n_estimators', args.n_estimators)
+
+    # Perform cross-validation (the estimator is passed as an argument; an instance is not callable)
+    k = 20
+    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+    K_results = cross_val_score(ada_boost, X, y, cv = kfold)
+    accuracy = np.mean(abs(K_results))
+
+    # Log cross-validation-metrics (mean of the absolute per-fold scores)
+    log_metric('cv_accuracy', accuracy)
+    info('cv_accuracy loaded successfully.')
+
+    # Log the model under its own artifact path
+    log_model(ada_boost, 'ada-boost')
+
+    # Print the run UUID
+    print('Model run: ', active_run().info.run_uuid)
+
+# End run (redundant safeguard: leaving the with-block above already ends the run)
+end_run()
\ No newline at end of file
diff --git a/src/model_experiments/bagging.py b/src/model_experiments/bagging.py
new file mode 100644
index 0000000..02727f8
--- /dev/null
+++ b/src/model_experiments/bagging.py
@@ -0,0 +1,63 @@
+import os
+import numpy as np
+import pandas as pd
+import argparse
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.ensemble import BaggingRegressor
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# Move the working directory up one level; the relative data paths below resolve against it
+os.chdir('../')
+
+# Get Data: X / y feed the cross-validation below, X_train / y_train feed evaluator.train
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+# Parse command-line arguments (--n_estimators is forwarded unchanged to BaggingRegressor)
+parser = argparse.ArgumentParser()
+parser.add_argument('--n_estimators', type = int, default = 10)
+args = parser.parse_args()
+
+# Instances
+bagging = BaggingRegressor(n_estimators = args.n_estimators)
+evaluator = ModelEvaluator()
+
+# Experiment: everything below is recorded in a single MLflow run
+with start_run():
+ run_name = 'Bagging'
+ set_tag('mlflow.runName', run_name)
+
+ # Train the model -- NOTE(review): evaluator is built without bagging; confirm train() fits it
+ evaluator.train(X_train, y_train)
+
+ # Log hyperparameters
+ log_param('n_estimators', args.n_estimators)
+
+ # Perform cross-validation (20 shuffled folds; cross_val_score fits clones, not bagging itself)
+ k = 20
+ kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+ K_results = cross_val_score(bagging, X, y, cv = kfold)
+ accuracy = np.mean(abs(K_results))
+
+ # Log cross-validation-metrics (mean of the absolute per-fold scores)
+ log_metric('cv_accuracy', accuracy)
+ info('cv accuracy loaded successfully.')
+
+ # Log the model
+ log_model(bagging, 'bagging')
+
+ # Print the run UUID
+ print('Model run: ', active_run().info.run_uuid)
+
+# End run (redundant safeguard: leaving the with-block above already ends the run)
+end_run()
\ No newline at end of file
diff --git a/src/model_experiments/decisiontree.py b/src/model_experiments/decisiontree.py
new file mode 100644
index 0000000..c8c9c67
--- /dev/null
+++ b/src/model_experiments/decisiontree.py
@@ -0,0 +1,72 @@
+import os
+import numpy as np
+import pandas as pd
+import argparse
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# Move the working directory up one level; the relative data paths below resolve against it
+os.chdir('../')
+
+# Get Data: X / y feed the cross-validation below, X_train / y_train feed evaluator.train
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+# Parse command-line arguments (each option is forwarded unchanged to DecisionTreeRegressor)
+parser = argparse.ArgumentParser()
+parser.add_argument('--min_samples_leaf', type = int, default = 1)
+parser.add_argument('--min_samples_split', type = int, default = 2)
+parser.add_argument('--max_features', type = int, default = None)
+parser.add_argument('--max_depth', type = int, default = None)
+args = parser.parse_args()
+
+# Instances
+dt_model = DecisionTreeRegressor(min_samples_leaf = args.min_samples_leaf,
+                                 min_samples_split = args.min_samples_split,
+                                 max_depth = args.max_depth,
+                                 max_features = args.max_features)
+evaluator = ModelEvaluator()
+
+# Experiment: everything below is recorded in a single MLflow run
+with start_run():
+    run_name = 'Decision Tree'
+    set_tag('mlflow.runName', run_name)
+
+    # Train the model -- NOTE(review): evaluator is built without dt_model; confirm train() fits it
+    evaluator.train(X_train, y_train)
+
+    # Log hyperparameters
+    log_param("max_depth", args.max_depth)
+    log_param("min_samples_split", args.min_samples_split)
+    log_param("min_samples_leaf", args.min_samples_leaf)
+    log_param("max_features", args.max_features)
+
+    # Perform cross-validation (20 shuffled folds; cross_val_score fits clones, not dt_model itself)
+    k = 20
+    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+    K_results = cross_val_score(dt_model, X, y, cv = kfold)
+    accuracy = np.mean(abs(K_results))
+
+    # Log cross-validation-metrics (mean of the absolute per-fold scores)
+    log_metric('cv_accuracy', accuracy)
+    info('cv accuracy loaded successfully.')
+
+    # Log the model under its own artifact path
+    log_model(dt_model, 'decision-tree')
+
+    # Print the run UUID
+    print('Model run: ', active_run().info.run_uuid)
+
+# End run (redundant safeguard: leaving the with-block above already ends the run)
+end_run()
\ No newline at end of file
diff --git a/src/model_experiments/gradientboost.py b/src/model_experiments/gradientboost.py
new file mode 100644
index 0000000..36c7886
--- /dev/null
+++ b/src/model_experiments/gradientboost.py
@@ -0,0 +1,64 @@
+import os
+import numpy as np
+import pandas as pd
+import argparse
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# Move the working directory up one level; the relative data paths below resolve against it
+os.chdir('../')
+
+# Get Data: X / y feed the cross-validation below, X_train / y_train feed evaluator.train
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+# Parse command-line arguments (--n_estimators is forwarded unchanged to the regressor)
+parser = argparse.ArgumentParser()
+parser.add_argument('--n_estimators', type = int, default = 10)
+args = parser.parse_args()
+
+# Instances (scikit-learn's class is GradientBoostingRegressor)
+gradient_boost = GradientBoostingRegressor(n_estimators = args.n_estimators)
+evaluator = ModelEvaluator()
+
+# Experiment: everything below is recorded in a single MLflow run
+with start_run():
+    # Set a custom run name
+    run_name = 'Gradient Boost'
+    set_tag('mlflow.runName', run_name)
+
+    # Train the model -- NOTE(review): evaluator is built without gradient_boost; confirm train() fits it
+    evaluator.train(X_train, y_train)
+
+    # Log hyperparameter
+    log_param('n_estimators', args.n_estimators)
+
+    # Perform cross-validation (the estimator is passed as an argument; an instance is not callable)
+    k = 20
+    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+    K_results = cross_val_score(gradient_boost, X, y, cv = kfold)
+    accuracy = np.mean(abs(K_results))
+
+    # Log cross-validation-metrics (mean of the absolute per-fold scores)
+    log_metric('cv_accuracy', accuracy)
+    info('cv_accuracy loaded successfully.')
+
+    # Log the model
+    log_model(gradient_boost, 'gradient-boost')
+
+    # Print the run UUID
+    print('Model run: ', active_run().info.run_uuid)
+
+# End run (redundant safeguard: leaving the with-block above already ends the run)
+end_run()
\ No newline at end of file
diff --git a/src/model_experiments/knn.py b/src/model_experiments/knn.py
new file mode 100644
index 0000000..536a6c0
--- /dev/null
+++ b/src/model_experiments/knn.py
@@ -0,0 +1,72 @@
+'''
+K-nearest-neighbours experiment.
+
+Fits a KNeighborsRegressor on the gold training split, scores it with
+k-fold cross-validation on the silver data and logs everything to MLflow.
+'''
+import os
+import numpy as np
+import pandas as pd
+import argparse
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# system
+os.chdir('../')
+
+# Get Data
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('--n_neighbors', type = int, default = 3)
+args = parser.parse_args()
+
+# Instances
+KNN = KNeighborsRegressor(n_neighbors = args.n_neighbors)
+evaluator = ModelEvaluator()
+
+# Experiment
+with start_run():
+    # Set a custom run name (this run trains KNN, not Gradient Boost)
+    run_name = 'KNN'
+    set_tag('mlflow.runName', run_name)
+
+    # Train the model; ModelEvaluator.train expects the estimator as its
+    # first argument, followed by the training features and target.
+    evaluator.train(KNN, X_train, y_train)
+
+    # Log hyperparameter under the name of the argument it comes from
+    log_param('n_neighbors', args.n_neighbors)
+
+    # Perform cross-validation: the estimator, features and target are
+    # separate arguments of cross_val_score (the estimator is not callable).
+    k = 20
+    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+    K_results = cross_val_score(KNN, X, y, cv = kfold)
+    accuracy = np.mean(abs(K_results))
+
+    # Log cross-validation-metrics
+    log_metric('cv_accuracy', accuracy)
+    info('cv_accuracy loaded successfully.')
+
+    # Log the model
+    log_model(KNN, 'KNN')
+
+    # Print the run UUID
+    print('Model run: ', active_run().info.run_uuid)
+
+# End run
+end_run()
\ No newline at end of file
diff --git a/src/MlfExpetiments.py b/src/model_experiments/modelevaluator.py
similarity index 59%
rename from src/MlfExpetiments.py
rename to src/model_experiments/modelevaluator.py
index b32f0c1..6524b90 100644
--- a/src/MlfExpetiments.py
+++ b/src/model_experiments/modelevaluator.py
@@ -1,18 +1,24 @@
-import logging
-from dataclasses import dataclass
-import mlflow
+from logging import info
+from mlflow import log_metric
+import numpy as np
import pandas as pd
-from sklearn import metrics
+from sklearn.metrics import mean_squared_error
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator
+from sklearn.model_selection import KFold, cross_val_score
-@dataclass
class ModelEvaluator:
'''
Utility class for training and evaluating scikit-learn models.
'''
+ def __init__(self):
+ pass
- def train(self, model: BaseEstimator, X_train: pd.DataFrame, y_train: pd.DataFrame) -> None:
+ def train(self, model: BaseEstimator, X_train: pd.DataFrame,
+ y_train: pd.DataFrame) -> None:
'''
Fits a scikit-learn model.
@@ -27,14 +33,15 @@ def train(self, model: BaseEstimator, X_train: pd.DataFrame, y_train: pd.DataFra
try:
model = model.fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
- mlflow.log_metric('train-accuracy', train_accuracy)
- logging.info(f'Train Accuracy: {train_accuracy:.3%}')
+ log_metric('train-accuracy', train_accuracy)
+ info(f'Train Accuracy: {train_accuracy:.2%}')
except Exception as e:
raise e
return None
- def evaluate(self, model: BaseEstimator, X_test: pd.DataFrame, y_test: pd.DataFrame) -> None:
+ def evaluate(self, model: BaseEstimator, X_test: pd.DataFrame,
+ y_test: pd.DataFrame) -> None:
'''
Evaluates a scikit-learn model.
@@ -51,21 +58,22 @@ def evaluate(self, model: BaseEstimator, X_test: pd.DataFrame, y_test: pd.DataFr
y_pred = model.predict(X_test)
# Model performance metrics
- r2_score = metrics.r2_score(y_test, y_pred)
- mse_score = metrics.mean_squared_error(y_test, y_pred)
+ # Bind the result to `r2`: reusing the imported name `r2_score` as a local makes the call raise UnboundLocalError
+ r2 = r2_score(y_test, y_pred)
+ mse_score = mean_squared_error(y_test, y_pred)
# Log metrics
- mlflow.log_metric('r2-score', r2_score)
- mlflow.log_metric('mse', mse_score)
+ log_metric('r2-score', r2)
+ log_metric('mse', mse_score)
# Print and log metrics
- logging.info('R2 Score: {:.3f}'.format(r2_score))
- logging.info('-' * 30)
- logging.info('MSE: {:.3f}'.format(mse_score))
+ info('R2 Score: {:.2f}'.format(r2))
+ info('MSE: {:.2f}'.format(mse_score))
- logging.info('-' * 30)
- logging.info('Metrics and artifacts logged!')
+ info('Metrics and artifacts logged!')
except Exception as e:
raise e
return None
+
+
diff --git a/src/model_experiments/randomforest.py b/src/model_experiments/randomforest.py
new file mode 100644
index 0000000..efaecfa
--- /dev/null
+++ b/src/model_experiments/randomforest.py
@@ -0,0 +1,73 @@
+'''
+Random Forest experiment.
+
+Fits a RandomForestRegressor on the gold training split, scores it with
+k-fold cross-validation on the silver data and logs everything to MLflow.
+'''
+import os
+import numpy as np
+import pandas as pd
+import argparse
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, log_param, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# system
+os.chdir('../')
+
+# Get Data
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('--n_estimators', type = int, default = 100)
+parser.add_argument('--max_depth', type = int, default = None)
+args = parser.parse_args()
+
+# Instances
+random_forest = RandomForestRegressor(n_estimators = args.n_estimators,
+                                      max_depth = args.max_depth)
+evaluator = ModelEvaluator()
+
+# Experiment
+with start_run():
+    run_name = 'Random Forest'
+    set_tag('mlflow.runName', run_name)
+
+    # Train the model; ModelEvaluator.train expects the estimator as its
+    # first argument, followed by the training features and target.
+    evaluator.train(random_forest, X_train, y_train)
+
+    # Log hyperparameters
+    log_param('n_estimators', args.n_estimators)
+    log_param('max_depth', args.max_depth)
+
+    # Perform cross-validation
+    k = 20
+    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+    K_results = cross_val_score(random_forest, X, y, cv = kfold)
+    accuracy = np.mean(abs(K_results))
+
+    # Log cross-validation-metrics
+    log_metric('cv_accuracy', accuracy)
+    info('cv accuracy loaded successfully.')
+
+    # Log the model
+    log_model(random_forest, 'random-forest')
+
+    # Print the run UUID
+    print('Model run: ', active_run().info.run_uuid)
+
+# End run
+end_run()
\ No newline at end of file
diff --git a/src/model_experiments/xgr.py b/src/model_experiments/xgr.py
new file mode 100644
index 0000000..cd465f3
--- /dev/null
+++ b/src/model_experiments/xgr.py
@@ -0,0 +1,62 @@
+'''
+XGBoost experiment.
+
+Fits an XGBRegressor on the gold training split, scores it with k-fold
+cross-validation on the silver data and logs everything to MLflow.
+'''
+import os
+import numpy as np
+import pandas as pd
+
+from modelevaluator import ModelEvaluator
+from logging import info
+from mlflow import log_metric, set_tag, start_run, active_run, end_run
+from mlflow.sklearn import log_model
+from xgboost.sklearn import XGBRegressor
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+
+# system
+os.chdir('../')
+
+# Get Data
+X = pd.read_csv('../data/2-silver/X.csv')
+X_train = pd.read_csv('../data/3-gold/X_train.csv')
+X_test = pd.read_csv('../data/3-gold/X_test.csv')
+
+y = pd.read_csv('../data/2-silver/y.csv')
+y_train = pd.read_csv('../data/3-gold/y_train.csv')
+y_test = pd.read_csv('../data/3-gold/y_test.csv')
+
+
+# Instances
+xgr = XGBRegressor()
+evaluator = ModelEvaluator()
+
+# Experiment
+with start_run():
+    run_name = 'XGBoost'
+    set_tag('mlflow.runName', run_name)
+
+    # Train the model; ModelEvaluator.train expects the estimator as its
+    # first argument, followed by the training features and target.
+    evaluator.train(xgr, X_train, y_train)
+
+    # Perform cross-validation
+    k = 20
+    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
+    K_results = cross_val_score(xgr, X, y, cv = kfold)
+    accuracy = np.mean(abs(K_results))
+
+    # Log cross-validation-metrics
+    log_metric('cv_accuracy', accuracy)
+    info('cv accuracy loaded successfully.')
+
+    # Log the model
+    log_model(xgr, 'xgboost')
+
+    # Print the run UUID
+    print('Model run: ', active_run().info.run_uuid)
+
+# End run
+end_run()
\ No newline at end of file
diff --git a/src/preprocessing/bronze_to_silver.py b/src/preprocessing/bronze_to_silver.py
new file mode 100644
index 0000000..e4764e4
--- /dev/null
+++ b/src/preprocessing/bronze_to_silver.py
@@ -0,0 +1,81 @@
+import dask.dataframe as dd
+import logging
+import os
+from dask.distributed import Client
+from typing import Tuple
+
+# Set up logging
+logs_dir = os.path.abspath('../../logs/')
+os.makedirs(logs_dir, exist_ok=True)
+logging.basicConfig(filename=os.path.join(logs_dir, 'dask_data_processing.log'),
+                    level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Set up logging to display logs in the CLI
+console = logging.StreamHandler()
+console.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+console.setFormatter(formatter)
+logging.getLogger('').addHandler(console)
+
+def process_data(file_path: str) -> Tuple[dd.DataFrame, dd.DataFrame]:
+    '''
+    Split the data stored in a Parquet file into features and target.
+
+    Parameters:
+        file_path (str): Path to the Parquet file containing the data.
+
+    Returns:
+        tuple: A tuple containing the Dask DataFrame for independent variables (X) and
+               the Dask DataFrame for the dependent variable (y).
+
+    Raises:
+        Exception: Any error raised while reading the file or selecting the
+                   columns is logged and then re-raised unchanged.
+    '''
+    # Log the start of data processing
+    logging.info(f'Processing data from file: {file_path}')
+
+    try:
+        # Read the Parquet file into a Dask DataFrame
+        df = dd.read_parquet(file_path)
+
+        # Split data into dependent (y) and independent (X) variables
+        X = df[['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age']]
+        y = df[['strength']]
+
+        return X, y
+    except Exception as e:
+        logging.error(f'It was not possible to process data: {file_path}. Error: {e}')
+        # Re-raise after logging: swallowing the error made this function
+        # return None, so the caller's `X, y = process_data(...)` failed with
+        # an unrelated unpacking TypeError instead of the real cause.
+        raise
+
+if __name__ == '__main__':
+    # Provide the path to the Parquet file containing the data
+    data_file_path = os.path.abspath('data/1-bronze/Concrete_Data_Cleaned.parquet')
+
+    # File paths to be saved
+    x_file_path = os.path.abspath('data/2-silver/X.parquet')
+    y_file_path = os.path.abspath('data/2-silver/Y.parquet')
+
+    # Connect to the Dask cluster
+    client = Client(n_workers = 4)
+
+    try:
+        # Process data using the function
+        X, y = process_data(data_file_path)
+
+        # Convert Dask DataFrames to Pandas DataFrames
+        X_pandas = X.compute()
+        y_pandas = y.compute()
+
+        # Save the Pandas DataFrames X and y to Parquet files
+        X_pandas.to_parquet(x_file_path, index = False)
+        y_pandas.to_parquet(y_file_path, index = False)
+
+        # Log the completion of data processing
+        logging.info('Data split completed.')
+    finally:
+        # Shutdown the Dask client even when processing fails
+        client.shutdown()
diff --git a/src/preprocessing/raw_to_bronze.py b/src/preprocessing/raw_to_bronze.py
new file mode 100644
index 0000000..4fa555d
--- /dev/null
+++ b/src/preprocessing/raw_to_bronze.py
@@ -0,0 +1,88 @@
+import dask.dataframe as dd
+import pandas as pd
+import logging
+import os
+
+from dask.distributed import Client
+
+# Set up logging
+logs_dir = os.path.abspath('../../logs/')
+os.makedirs(logs_dir, exist_ok=True)
+logging.basicConfig(filename=os.path.join(logs_dir, 'dask_data_processing.log'),
+                    level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Set up logging to display logs in the CLI
+console = logging.StreamHandler()
+console.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+console.setFormatter(formatter)
+logging.getLogger('').addHandler(console)
+
+
+def process_data(file_path: str) -> dd.DataFrame:
+    '''
+    Process raw data from a CSV file.
+
+    Renames the raw columns to short names, converts every column except 'strength'
+    to float64 and replaces IQR outliers in every column but the last with the median.
+
+    Parameters:
+        file_path (str): Path to the CSV file.
+
+    Returns:
+        dd.DataFrame: Processed DataFrame.
+    '''
+    logging.info(f"Processing data from file: {file_path}")
+    df = dd.read_csv(file_path)
+
+    df = df.rename(columns={'Cement (component 1)(kg in a m^3 mixture)': 'cement',
+                            'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': 'slag',
+                            'Fly Ash (component 3)(kg in a m^3 mixture)': 'ash',
+                            'Water (component 4)(kg in a m^3 mixture)': 'water',
+                            'Superplasticizer (component 5)(kg in a m^3 mixture)': 'superplastic',
+                            'Coarse Aggregate (component 6)(kg in a m^3 mixture)': 'coarseagg',
+                            'Fine Aggregate (component 7)(kg in a m^3 mixture)': 'fineagg',
+                            'Age (day)': 'age',
+                            'Concrete compressive strength(MPa, megapascals) ': 'strength'})
+
+    # Perform replacement of ',' with '.' and convert numeric columns to float64
+    for column in df.columns:
+        if column != 'strength':  # We skip the 'strength' column in this conversion
+            df[column] = df[column].replace({',': '.'}, regex = True).astype('float64')
+
+    # Every column except the last one is screened for outliers
+    for cols in df.columns[:-1]:
+        # calculating quartiles
+        Q1 = df[cols].quantile(0.25)
+        Q3 = df[cols].quantile(0.75)
+        # iqr range
+        iqr = Q3 - Q1
+
+        # calculating the low and high limits; the upper fence is measured
+        # from the third quartile (Q3), the lower one from the first (Q1)
+        low = Q1 - 1.5 * iqr
+        high = Q3 + 1.5 * iqr
+
+        # replacing outliers with the median value
+        df[cols] = df[cols].where((df[cols] >= low) & (df[cols] <= high), df[cols].median())
+
+    return df
+
+if __name__ == '__main__':
+    data_file_path = os.path.abspath('data/0-raw-data/Concrete_Data.csv')
+
+    # Create an absolute path for the Parquet file
+    parquet_file_path = os.path.abspath('data/1-bronze/Concrete_Data_Cleaned.parquet')
+
+    client = Client(n_workers=4)
+    try:
+        processed_data = process_data(data_file_path)
+        data_processed = processed_data.compute()
+
+        # Save the processed data as Parquet
+        data_processed.to_parquet(parquet_file_path, index=False)
+
+        logging.info("Data processing completed and saved to Parquet.")
+    finally:
+        # Release the Dask workers even when processing fails
+        client.shutdown()
diff --git a/src/preprocessing/silver_to_gold.py b/src/preprocessing/silver_to_gold.py
new file mode 100644
index 0000000..13ea092
--- /dev/null
+++ b/src/preprocessing/silver_to_gold.py
@@ -0,0 +1,83 @@
+import dask.dataframe as dd
+import logging
+import os
+from dask.distributed import Client
+from typing import Tuple
+from sklearn.model_selection import train_test_split as sk_train_test_split
+
+# Send log records to a file inside logs_dir ...
+logs_dir = os.path.abspath('../../logs/')
+os.makedirs(logs_dir, exist_ok=True)
+logging.basicConfig(
+    filename=os.path.join(logs_dir, 'dask_data_processing.log'),
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+)
+
+# ... and mirror them on the console with the same layout
+formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+console = logging.StreamHandler()
+console.setFormatter(formatter)
+console.setLevel(logging.INFO)
+logging.getLogger('').addHandler(console)
+
+def process_data(file_path_x: str,
+                 file_path_y: str) -> Tuple[dd.DataFrame, dd.DataFrame,
+                                            dd.DataFrame, dd.DataFrame]:
+    '''
+    Standardise the features and split features/target into train and test sets.
+
+    Parameters:
+        file_path_x (str): Parquet file holding the independent variables (X).
+        file_path_y (str): Parquet file holding the dependent variable (y).
+
+    Returns:
+        tuple: X_train, X_test, y_train and y_test, in that order. Both inputs
+               are materialised with .compute() before the split is made.
+    '''
+    logging.info(f'Processing data from files: {file_path_x}, {file_path_y}')
+
+    features = dd.read_parquet(file_path_x)
+    target = dd.read_parquet(file_path_y)
+
+    # z-score: subtract the column mean, divide by the column standard deviation
+    standardised = (features - features.mean()) / features.std()
+
+    logging.info('Converting Dask DataFrames to Pandas DataFrames...')
+    features_computed = standardised.compute()
+    target_computed = target.compute()
+
+    logging.info("Splitting data into train and test sets...")
+    split_frames = sk_train_test_split(features_computed, target_computed,
+                                       test_size=0.3, random_state=1)
+    X_train, X_test, y_train, y_test = split_frames
+
+    logging.info('Data processing completed.')
+    return X_train, X_test, y_train, y_test
+
+if __name__ == '__main__':
+    # Silver inputs
+    file_path_x = os.path.abspath('data/2-silver/X.parquet')
+    file_path_y = os.path.abspath('data/2-silver/Y.parquet')
+
+    # Gold outputs, listed in the order process_data returns the frames
+    gold_paths = [
+        os.path.abspath('data/3-gold/X_train.parquet'),
+        os.path.abspath('data/3-gold/X_test.parquet'),
+        os.path.abspath('data/3-gold/y_train.parquet'),
+        os.path.abspath('data/3-gold/y_test.parquet'),
+    ]
+
+    logging.info('Connecting to the Dask cluster...')
+    client = Client(n_workers=4)
+
+    gold_frames = process_data(file_path_x, file_path_y)
+
+    logging.info('Saving DataFrames to Parquet files...')
+    for frame, destination in zip(gold_frames, gold_paths):
+        frame.to_parquet(destination, index=False)
+
+    logging.info('Data split completed.')
+
+    # Shut down the Dask Client
+    client.shutdown()
diff --git a/src/requirements.txt b/src/requirements.txt
index f35e505..fc14e56 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -2,6 +2,7 @@ alembic==1.11.1
asttokens==2.2.1
backcall==0.2.0
blinker==1.6.2
+bokeh==3.2.1
certifi==2023.5.7
charset-normalizer==3.2.0
click==8.1.5
@@ -10,14 +11,19 @@ colorama==0.4.6
comm==0.1.3
contourpy==1.1.0
cycler==0.11.0
+dask==2023.7.1
+dask-glm==0.2.0
+dask-ml==2023.3.24
databricks-cli==0.17.7
debugpy==1.6.7
decorator==5.1.1
+distributed==2023.7.1
docker==6.1.3
entrypoints==0.4
executing==1.2.0
Flask==2.3.2
fonttools==4.40.0
+fsspec==2023.6.0
gitdb==4.0.10
GitPython==3.1.32
greenlet==2.0.2
@@ -32,18 +38,25 @@ joblib==1.2.0
jupyter_client==8.2.0
jupyter_core==5.3.1
kiwisolver==1.4.4
+llvmlite==0.40.1
+locket==1.0.0
+lz4==4.3.2
Mako==1.2.4
Markdown==3.4.3
MarkupSafe==2.1.3
matplotlib==3.7.1
matplotlib-inline==0.1.6
mlflow==2.4.2
+msgpack==1.0.5
+multipledispatch==1.0.0
nest-asyncio==1.5.6
-numpy==1.25.0
+numba==0.57.1
+numpy==1.24.4
oauthlib==3.2.2
packaging==23.1
-pandas==2.0.2
+pandas==2.0.3
parso==0.8.3
+partd==1.4.0
pickleshare==0.7.5
Pillow==9.5.0
platformdirs==3.6.0
@@ -67,11 +80,14 @@ scipy==1.10.1
seaborn==0.12.2
six==1.16.0
smmap==5.0.0
+sortedcontainers==2.4.0
SQLAlchemy==2.0.19
sqlparse==0.4.4
stack-data==0.6.2
tabulate==0.9.0
+tblib==2.0.0
threadpoolctl==3.1.0
+toolz==0.12.0
tornado==6.3.2
traitlets==5.9.0
typing_extensions==4.7.1
@@ -82,4 +98,6 @@ wcwidth==0.2.6
websocket-client==1.6.1
Werkzeug==2.3.6
xgboost==1.7.6
+xyzservices==2023.7.0
+zict==3.0.0
zipp==3.16.2