From 8c1d80c3d7b39a074cacdfe8514dc7d1a7171618 Mon Sep 17 00:00:00 2001 From: shishir kudchadker Date: Fri, 18 Nov 2022 07:32:49 +0530 Subject: [PATCH 1/4] First commit with setup and DVC files --- .dvc/.gitignore | 3 +++ .dvc/config | 5 +++++ .dvcignore | 3 +++ data/raw/.gitignore | 2 ++ data/raw/train.dvc | 5 +++++ data/raw/val.dvc | 5 +++++ 6 files changed, 23 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 data/raw/train.dvc create mode 100644 data/raw/val.dvc diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..7a5426e9 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + analytics = false + remote = remote_storage +['remote "remote_storage"'] + url = /home/mlops/dvc_remote diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/data/raw/.gitignore b/data/raw/.gitignore index e69de29b..a5d9d98f 100644 --- a/data/raw/.gitignore +++ b/data/raw/.gitignore @@ -0,0 +1,2 @@ +/train +/val diff --git a/data/raw/train.dvc b/data/raw/train.dvc new file mode 100644 index 00000000..14162364 --- /dev/null +++ b/data/raw/train.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 7adc7abb69056f4d7afb512c78f2fce9.dir + size: 75309082 + nfiles: 9470 + path: train diff --git a/data/raw/val.dvc b/data/raw/val.dvc new file mode 100644 index 00000000..237148f4 --- /dev/null +++ b/data/raw/val.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 0ad4dcf197b452735726bf8d8777201d.dir + size: 31248080 + nfiles: 3925 + path: val From f4617abbb94261487ce1e1a256bc3ec20268db20 Mon Sep 17 00:00:00 2001 From: shishir kudchadker Date: Sat, 19 Nov 2022 07:34:06 +0530 Subject: [PATCH 2/4] Created train and test csv files --- data/prepared/.gitignore | 2 ++ data/prepared/test.csv.dvc | 4 ++++ data/prepared/train.csv.dvc | 4 ++++ 3 files changed, 10 insertions(+) create mode 100644 data/prepared/test.csv.dvc create mode 100644 data/prepared/train.csv.dvc diff --git a/data/prepared/.gitignore b/data/prepared/.gitignore index e69de29b..22a65dd9 100644 --- a/data/prepared/.gitignore +++ b/data/prepared/.gitignore @@ -0,0 +1,2 @@ +/train.csv +/test.csv diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc new file mode 100644 index 00000000..3b350283 --- /dev/null +++ b/data/prepared/test.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: cbd4ba69ced15e40820a635e7e741627 + size: 71491 + path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc new file mode 100644 index 00000000..ae08b157 --- /dev/null +++ b/data/prepared/train.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 7f79cf9a4ab1316f7d7246d0b92ea16c + size: 178060 + path: train.csv From 2277f670cb34adb86a666ec24ff056becc3ad358 Mon Sep 17 00:00:00 2001 From: shishir kudchadker Date: Sat, 19 Nov 2022 17:27:59 +0530 
Subject: [PATCH 3/4] Trained an SGD classifier --- model/.gitignore | 1 + model/model.joblib.dvc | 4 ++++ src/train.py | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 model/model.joblib.dvc diff --git a/model/.gitignore b/model/.gitignore index e69de29b..565a9d50 100644 --- a/model/.gitignore +++ b/model/.gitignore @@ -0,0 +1 @@ +/model.joblib diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc new file mode 100644 index 00000000..4b96d593 --- /dev/null +++ b/model/model.joblib.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 22490d1b369e3f7423c5b6ebd4db4234 + size: 241075 + path: model.joblib diff --git a/src/train.py b/src/train.py index e5feeda6..5d3c7c2c 100644 --- a/src/train.py +++ b/src/train.py @@ -37,7 +37,7 @@ def load_data(data_path): def main(repo_path): train_csv_path = repo_path / "data/prepared/train.csv" train_data, labels = load_data(train_csv_path) - sgd = SGDClassifier(max_iter=10) + sgd = SGDClassifier(max_iter=100) trained_model = sgd.fit(train_data, labels) dump(trained_model, repo_path / "model/model.joblib") From 5d0b3e5928fa8b064c425df4e5025b43002f3516 Mon Sep 17 00:00:00 2001 From: shishir kudchadker Date: Sat, 19 Nov 2022 17:34:33 +0530 Subject: [PATCH 4/4] Evaluate the SGD model accuracy --- metrics/accuracy.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 metrics/accuracy.json diff --git a/metrics/accuracy.json b/metrics/accuracy.json new file mode 100644 index 00000000..39176609 --- /dev/null +++ b/metrics/accuracy.json @@ -0,0 +1 @@ +{"accuracy": 0.7490494296577946} \ No newline at end of file