Data Science Projects

Author: Abebawu Eshetu

Research Interests: Natural Language Processing, Machine Learning, and Computer Vision for Social Good.

# import required libraries
import warnings
warnings.filterwarnings("ignore")  # e.g. sklearn's UndefinedMetricWarning
import re
import os
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
from pathlib import Path
from tqdm import tqdm
from datetime import datetime
from imutils import paths

# define the names of the training and validation splits, the dataset
# root, the class labels, and the batch size used for feature extraction
TRAIN = "training"
VAL = "testing"
BASE_PATH = "Dataset"
CLASSES = ["healthy_wheat", "leaf_rust", "stem_rust"]
BATCH_SIZE = 32

# initialize the label encoder file path and the output directory to
# where the extracted features (in CSV file format) will be stored
LE_PATH = os.path.sep.join(["output", "le.cpickle"])
BASE_CSV_PATH = "output"

# set the path to the serialized model after training
MODEL_PATH = os.path.sep.join(["output", "model.cpickle"])

# paths to the raw training and testing images
data_dir_train = "train/train/"
data_dir_test = "test/test/"

# copy the training images into Dataset/training/<class>/ subdirectories
print("[INFO] processing '{} dataset'...".format(TRAIN))
for classes in os.listdir(data_dir_train):
    print("[INFO] loading images {}...".format(classes))
    classPath = os.path.join(data_dir_train, classes)
    imagePaths = list(paths.list_images(classPath))
    for imagePath in imagePaths:
        # extract the filename from the full image path
        filename = imagePath.split(os.path.sep)[-1]
        # construct the path to the output directory
        dirPath = os.path.sep.join([BASE_PATH, TRAIN, classes])
        # if the output directory does not exist, create it
        if not os.path.exists(dirPath):
            os.makedirs(dirPath)
        # construct the path to the output image file and copy it
        p = os.path.sep.join([dirPath, filename])
        shutil.copy2(imagePath, p)
# copy the unlabeled test images into a flat Dataset/testing/ directory
print("[INFO] processing '{} dataset'...".format(VAL))
for image in os.listdir(data_dir_test):
    imagePath = os.path.join(data_dir_test, image)
    # construct the path to the output directory
    dirPath = os.path.sep.join([BASE_PATH, VAL])
    # if the output directory does not exist, create it
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)
    # construct the path to the output image file and copy it
    p = os.path.sep.join([dirPath, image])
    shutil.copy2(imagePath, p)
[INFO] processing 'testing dataset'...
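After both copy steps, Dataset/ should hold per-class subfolders under training/ and a flat pool of images under testing/. A quick sanity check of that layout (a hypothetical snippet, not part of the original run):

# print a few entries from each split to confirm the expected layout:
# Dataset/training/<class>/... and Dataset/testing/<image>
for split in (TRAIN, VAL):
    splitPath = os.path.sep.join([BASE_PATH, split])
    print(split, "->", sorted(os.listdir(splitPath))[:3])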
# import the necessary packages
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input, decode_predictions
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from imutils import paths
import pickle
import random
# load the VGG16 network and initialize the label encoder
print("[INFO] loading network...")
model = VGG16(weights="imagenet", include_top=False)
le = None

# grab all image paths in the current split
print("[INFO] processing '{} dataset'...".format(TRAIN))
p = os.path.sep.join([BASE_PATH, TRAIN])
imagePaths = list(paths.list_images(p))

# randomly shuffle the image paths and then extract the class
# labels from the file paths
random.shuffle(imagePaths)
labels = [p.split(os.path.sep)[-2] for p in imagePaths]

# if the label encoder is None, create it
if le is None:
    le = LabelEncoder()
    le.fit(labels)

# open the output CSV file for writing
csvPath = os.path.sep.join([BASE_CSV_PATH,
    "{}.csv".format(TRAIN)])
csv = open(csvPath, "w")

# loop over the images in batches
for (b, i) in enumerate(range(0, len(imagePaths), BATCH_SIZE)):
    # extract the batch of images and labels, then initialize the
    # list of actual images that will be passed through the network
    # for feature extraction
    print("[INFO] processing batch {}/{}".format(b + 1,
        int(np.ceil(len(imagePaths) / float(BATCH_SIZE)))))
    batchPaths = imagePaths[i:i + BATCH_SIZE]
    batchLabels = le.transform(labels[i:i + BATCH_SIZE])
    batchImages = []

    # loop over the images and labels in the current batch
    for imagePath in batchPaths:
        # load the input image using the Keras helper utility
        # while ensuring the image is resized to 224x224 pixels
        image = load_img(imagePath, target_size=(224, 224))
        image = img_to_array(image)

        # preprocess the image by (1) expanding the dimensions and
        # (2) subtracting the mean RGB pixel intensity from the
        # ImageNet dataset
        image = np.expand_dims(image, axis=0)
        image = preprocess_input(image)

        # add the image to the batch
        batchImages.append(image)

    # pass the images through the network and use the outputs as
    # our actual features, then reshape the features into a
    # flattened volume
    batchImages = np.vstack(batchImages)
    features = model.predict(batchImages, batch_size=BATCH_SIZE)
    features = features.reshape((features.shape[0], 7 * 7 * 512))

    # loop over the class labels and extracted features
    for (label, vec) in zip(batchLabels, features):
        # construct a row consisting of the class label and the
        # extracted features
        vec = ",".join([str(v) for v in vec])
        csv.write("{},{}\n".format(label, vec))

# close the CSV file
csv.close()

# serialize the label encoder to disk
f = open(LE_PATH, "wb")
f.write(pickle.dumps(le))
f.close()
[INFO] loading network...
[INFO] processing 'training dataset'...
[INFO] processing batch 1/28
[INFO] processing batch 2/28
[INFO] processing batch 3/28
[INFO] processing batch 4/28
[INFO] processing batch 5/28
[INFO] processing batch 6/28
[INFO] processing batch 7/28
[INFO] processing batch 8/28
[INFO] processing batch 9/28
[INFO] processing batch 10/28
[INFO] processing batch 11/28
[INFO] processing batch 12/28
[INFO] processing batch 13/28
[INFO] processing batch 14/28
[INFO] processing batch 15/28
[INFO] processing batch 16/28
[INFO] processing batch 17/28
[INFO] processing batch 18/28
[INFO] processing batch 19/28
[INFO] processing batch 20/28
[INFO] processing batch 21/28
[INFO] processing batch 22/28
[INFO] processing batch 23/28
[INFO] processing batch 24/28
[INFO] processing batch 25/28
[INFO] processing batch 26/28
[INFO] processing batch 27/28
[INFO] processing batch 28/28
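A note on the reshape to 7 * 7 * 512 above: with include_top=False, VGG16 keeps only its convolutional base, whose final max-pooling layer turns a 224x224x3 input into a 7x7x512 activation volume, i.e. 25,088 values per image once flattened. A quick check (hypothetical snippet):

# feed one blank 224x224 RGB image through the convolutional base
dummy = np.zeros((1, 224, 224, 3), dtype="float32")
print(model.predict(dummy).shape)  # expected: (1, 7, 7, 512) -> 25088 flattened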
# import the necessary packages
import warnings
warnings.filterwarnings("ignore") 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
import os

def load_data_split(splitPath):
    # initialize the data and labels
    data = []
    labels = []

    # loop over the rows in the data split file
    for row in open(splitPath):
        # extract the class label and features from the row
        row = row.strip().split(",")
        label = row[0]
        features = np.array(row[1:], dtype="float")

        # update the data and label lists
        data.append(features)
        labels.append(label)

    # convert the data and labels to NumPy arrays
    data = np.array(data)
    labels = np.array(labels)

    # return a tuple of the data and labels
    return (data, labels)

# derive the path to the training CSV file
trainingPath = os.path.sep.join([BASE_CSV_PATH, "{}.csv".format(TRAIN)])

# load the data from disk
print("[INFO] loading data...")
(trainX, trainY) = load_data_split(trainingPath)
print(trainX.shape, trainY.shape)
# load the label encoder from disk
le = pickle.loads(open(LE_PATH, "rb").read())
[INFO] loading data...
(876, 25088) (876,)

# train the model
print("[INFO] training model...")
model_lg = LogisticRegression(solver="lbfgs", multi_class="auto")
model_lg.fit(trainX, trainY)

# evaluate the model on the training set (the provided test split is unlabeled)
print("[INFO] evaluating...")
prediction_lg = model_lg.predict(trainX)
print(classification_report(trainY, prediction_lg, target_names=le.classes_))


# serialize the model to disk
print("[INFO] saving model...")
f = open(MODEL_PATH, "wb")
f.write(pickle.dumps(model_lg))
f.close()
[INFO] training model...
[INFO] evaluating...
               precision    recall  f1-score   support

healthy_wheat       1.00      1.00      1.00       142
    leaf_rust       0.99      0.99      0.99       358
    stem_rust       0.99      0.99      0.99       376

     accuracy                           1.00       876
    macro avg       1.00      1.00      1.00       876
 weighted avg       1.00      1.00      1.00       876

[INFO] saving model...
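The report above is computed on the same features the classifier was fit on, so the near-perfect scores say little about generalization. Because the provided test split is unlabeled, one option is to carve a validation set out of the extracted training features; a minimal sketch using the already-imported train_test_split (variable names here are hypothetical):

# hold out 20% of the extracted features for a less optimistic estimate
(trX, valX, trY, valY) = train_test_split(trainX, trainY,
    test_size=0.2, random_state=42, stratify=trainY)
holdout_lg = LogisticRegression(solver="lbfgs", multi_class="auto")
holdout_lg.fit(trX, trY)
print(classification_report(valY, holdout_lg.predict(valX), target_names=le.classes_))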
from xgboost import XGBClassifier
# fit the model on the training data
xgb_clf = XGBClassifier()
print('INFO: Training XGB ...')
xgb_clf.fit(trainX, trainY)

# evaluate the model on the training set
print("[INFO] evaluating on the training set...")
predsTrain = xgb_clf.predict(trainX)
print(classification_report(trainY, predsTrain, target_names=le.classes_))

# serialize the model to disk
print("[INFO] saving model...")
MODEL_PATH = os.path.sep.join(["output", "model_xgb.cpickle"])
f = open(MODEL_PATH, "wb")
f.write(pickle.dumps(xgb_clf))
f.close()
INFO: Training XGB ...
[INFO] evaluating on the training set...
               precision    recall  f1-score   support

healthy_wheat       1.00      1.00      1.00       142
    leaf_rust       1.00      0.99      0.99       358
    stem_rust       0.99      1.00      0.99       376

     accuracy                           1.00       876
    macro avg       1.00      1.00      1.00       876
 weighted avg       1.00      1.00      1.00       876

[INFO] saving model...
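The same training-set caveat applies to the XGBoost report. For a cheaper, less biased comparison between classifiers, k-fold cross-validation over the training features works well; a sketch assuming scikit-learn's cross_val_score (the string labels are cast to int, which newer xgboost versions require):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the extracted features
cv_scores = cross_val_score(XGBClassifier(), trainX, trainY.astype(int), cv=5)
print("CV accuracy: {:.3f} +/- {:.3f}".format(cv_scores.mean(), cv_scores.std()))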
# the VGG16 model and label encoder are already in memory from the
# training pass above, so here we only extract features for the test split

# grab all image paths in the current split
print("[INFO] processing '{} dataset'...".format(VAL))
p = os.path.sep.join([BASE_PATH, VAL])
imagePaths = list(paths.list_images(p))


# open the output CSV file for writing
csvPathTest = os.path.sep.join([BASE_CSV_PATH,
    "{}.csv".format(VAL)])
csvTest = open(csvPathTest, "w")

# loop over the images in batches
for (b, i) in enumerate(range(0, len(imagePaths), BATCH_SIZE)):
    # extract the batch of images and labels, then initialize the
    # list of actual images that will be passed through the network
    # for feature extraction
    print("[INFO] processing batch {}/{}".format(b + 1,
        int(np.ceil(len(imagePaths) / float(BATCH_SIZE)))))
    batchPaths = imagePaths[i:i + BATCH_SIZE]
    batchImages = []

    # loop over the images and labels in the current batch
    for imagePath in batchPaths:
        # load the input image using the Keras helper utility
        # while ensuring the image is resized to 224x224 pixels
        image = load_img(imagePath, target_size=(224, 224))
        image = img_to_array(image)

        # preprocess the image by (1) expanding the dimensions and
        # (2) subtracting the mean RGB pixel intensity from the
        # ImageNet dataset
        image = np.expand_dims(image, axis=0)
        image = preprocess_input(image)

        # add the image to the batch
        batchImages.append(image)

    # pass the images through the network and use the outputs as
    # our actual features, then reshape the features into a
    # flattened volume
    batchImages = np.vstack(batchImages)
    features = model.predict(batchImages, batch_size=BATCH_SIZE)
    features = features.reshape((features.shape[0], 7 * 7 * 512))

    # loop over the extracted features (the test split is unlabeled)
    for vec in features:
        # construct a row consisting only of the extracted features
        vec = ",".join([str(v) for v in vec])
        csvTest.write("{}\n".format(vec))

# close the CSV file
csvTest.close()

[INFO] processing 'testing dataset'...
[INFO] processing batch 1/20
[INFO] processing batch 2/20
[INFO] processing batch 3/20
[INFO] processing batch 4/20
[INFO] processing batch 5/20
[INFO] processing batch 6/20
[INFO] processing batch 7/20
[INFO] processing batch 8/20
[INFO] processing batch 9/20
[INFO] processing batch 10/20
[INFO] processing batch 11/20
[INFO] processing batch 12/20
[INFO] processing batch 13/20
[INFO] processing batch 14/20
[INFO] processing batch 15/20
[INFO] processing batch 16/20
[INFO] processing batch 17/20
[INFO] processing batch 18/20
[INFO] processing batch 19/20
[INFO] processing batch 20/20

# import the necessary packages
import warnings
warnings.filterwarnings("ignore") 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
import pickle
import os

# note: this redefines load_data_split for the unlabeled test split
def load_data_split(splitPath):
    # initialize the list of feature vectors (no class labels here)
    data = []
    # loop over the rows in the data split file
    for row in open(splitPath):
        # parse the comma-separated feature vector
        row = row.strip().split(",")
        features = np.array(row, dtype="float")
        data.append(features)

    # convert the data to a NumPy array and return it
    data = np.array(data)
    return data

# derive the paths to the training and testing CSV files
testingPath = os.path.sep.join([BASE_CSV_PATH, "{}.csv".format(VAL)])

# load the data from disk
print("[INFO] loading data...")
testX = load_data_split(testingPath)
print(testX.shape)
[INFO] loading data...
(610, 25088)
def create_submission(prediction, submission_name):
    # note: this assumes os.listdir returns the test files in the same
    # order in which their features were extracted above; sorting both
    # path lists would make the pairing robust
    test_images = [s.split('.')[0].strip() for s in os.listdir(data_dir_test)]
    print(len(test_images), test_images[:5])
    submission = pd.DataFrame({'ID': test_images})
    # label the probability columns in le.classes_ order (the order in
    # which predict_proba emits them), then fill them row by row
    sub = pd.DataFrame(np.zeros((len(prediction), len(le.classes_)), dtype=np.float32),
                       columns=list(le.classes_))
    for i in tqdm(range(0, len(prediction))):
        sub.loc[i] = prediction[i]
    # reorder the columns to match the expected submission header
    sub = sub[["leaf_rust", "stem_rust", "healthy_wheat"]]
    submission = pd.concat([submission, sub], axis=1)
    submission.to_csv("submission/{}.csv".format(submission_name), index=False)
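Note: predict_proba orders its columns by the model's classes_ attribute, which here follows the alphabetical le.classes_ order (healthy_wheat, leaf_rust, stem_rust), while the submission header expects leaf_rust, stem_rust, healthy_wheat; the explicit relabel-and-reorder above keeps each probability in the right column.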
# predict class probabilities for the test set
print("[INFO] evaluating...")
prediction_lg = model_lg.predict_proba(testX)
[INFO] evaluating...
create_submission(prediction_lg, 'vggnet_feature_extraction_lg')
610 ['008FWT', '00AQXY', '01OJZX', '07OXKK', '085IEC']


100%|██████████████████████████████████████████████████████████████████████████████| 610/610 [00:00<00:00, 1091.22it/s]
# predict class probabilities for the test set
print("[INFO] evaluating...")
prediction_xgb = xgb_clf.predict_proba(testX)
[INFO] evaluating...
create_submission(prediction_xgb, 'vggnet_feature_extraction_xgb')
610 ['008FWT', '00AQXY', '01OJZX', '07OXKK', '085IEC']


100%|██████████████████████████████████████████████████████████████████████████████| 610/610 [00:00<00:00, 1153.11it/s]
from sklearn.svm import SVC
svm_clf = SVC(gamma='auto')
print("[INFO] Training SVM...")
svm_clf.fit(trainX, trainY)

# evaluate the model on the training set; SVC was not fit with
# probability=True, so use predict() rather than predict_proba()
print("[INFO] evaluating...")
preds_svm = svm_clf.predict(trainX)
print(classification_report(trainY, preds_svm, target_names=le.classes_))

# serialize the model to disk
print("[INFO] saving model...")
MODEL_PATH = os.path.sep.join(["output", "model_svm.cpickle"])
f = open(MODEL_PATH, "wb")
f.write(pickle.dumps(svm_clf))
f.close()
[INFO] Training SVM...
[INFO] evaluating...
               precision    recall  f1-score   support

healthy_wheat       1.00      1.00      1.00       142
    leaf_rust       0.99      1.00      0.99       358
    stem_rust       1.00      0.99      0.99       376

     accuracy                           1.00       876
    macro avg       1.00      1.00      1.00       876
 weighted avg       1.00      1.00      1.00       876

[INFO] saving model...
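An SVM-based submission would also need class probabilities, which SVC only exposes when it is fit with probability=True (enabling Platt scaling at some extra training cost). A minimal sketch, reusing the create_submission helper above (not part of the original run):

# refit the SVM with probability estimates enabled so predict_proba works
svm_prob_clf = SVC(gamma='auto', probability=True)
svm_prob_clf.fit(trainX, trainY)
prediction_svm = svm_prob_clf.predict_proba(testX)
create_submission(prediction_svm, 'vggnet_feature_extraction_svm')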