Data Science Projects

Abebawu Eshetu

Author: Abebawu Eshetu

Research Interests: Natural Language Processing, Machine Learning, and Computer Vision for Social Good.

import skimage
from lxml import etree
import os ## for OS level file processing such as file directory reading
import glob ## to match file name pattern for files on defined directory
from sklearn.model_selection import train_test_split ## to split train, test, and validation sets of dataset
import numpy as np ## for array processing
from skimage import io
from scipy import misc
import get_patches
from sklearn import metrics
import detectobjects as det
import cv2

## Directories the images are loaded from. MALARIA_DIR keeps its trailing
## slash because read_and_split_data builds its glob pattern by plain string
## concatenation (img_dir + '*.jpg').
MALARIA_DIR='malaria/images/'  # alternative dataset location: 'plasmodium/'
TUBERCLOSIS_DIR='tuberculosis/'  # NOTE(review): name is misspelled ("TUBERCULOSIS"); left as-is in case it is referenced elsewhere
INTESTINAL_PARSITE_DIR='intestinalparasites/'  # NOTE(review): name is misspelled ("PARASITE"); left as-is in case it is referenced elsewhere
def read_and_split_data(img_dir, train_split=0.6, test_split=0.2, val_split=0.2):
    '''Split the ``*.jpg`` files of a directory into train, validation and test sets.

    Args:
        img_dir: Directory prefix that is concatenated with ``'*.jpg'`` to
            build the glob pattern, so it must end with a path separator
            (e.g. ``'malaria/images/'``).
        train_split: Fraction of all images assigned to the training set.
        test_split: Fraction of all images assigned to the test set.
        val_split: Fraction of all images assigned to the validation set.
            When it is not positive, no validation set is produced.

    Returns:
        A ``(trainfiles, valfiles, testfiles)`` tuple of lists of base file
        names (directory part stripped). Note the order: validation comes
        before test.

    Raises:
        ValueError: If no ``*.jpg`` file is found under ``img_dir``.
    '''
    # glob yields files in arbitrary, platform-dependent order; sorting makes
    # the seeded (random_state=1) split below reproducible across machines.
    baseimgfilenames = sorted(os.path.basename(f)
                              for f in glob.glob(img_dir + '*.jpg'))
    if not baseimgfilenames:
        raise ValueError('No *.jpg images found in %r' % (img_dir,))

    all_images = np.arange(len(baseimgfilenames))
    if val_split > 0:
        # First carve the validation set off the full collection.
        train, val = train_test_split(all_images,
                                      train_size=train_split + test_split,
                                      test_size=val_split,
                                      random_state=1)
        # The second split only sees the remaining train+test pool, so the
        # test share has to be expressed relative to that pool. Using
        # test_split directly would shrink the test set (e.g. 16% instead of
        # the requested 20% with the default proportions).
        test_fraction = test_split / (train_split + test_split)
        train, test = train_test_split(train,
                                       train_size=1 - test_fraction,
                                       test_size=test_fraction,
                                       random_state=1)
    else:
        val = []
        train, test = train_test_split(all_images,
                                       train_size=train_split + val_split,
                                       test_size=test_split,
                                       random_state=1)

    # Map the selected indices back to file names.
    trainfiles = [baseimgfilenames[i] for i in train]
    testfiles = [baseimgfilenames[i] for i in test]
    valfiles = [baseimgfilenames[i] for i in val]

    return trainfiles, valfiles, testfiles

# read_and_split_data returns (train, validation, test) in that order, so the
# result is unpacked in the same order; unpacking it as (train, test, val)
# would silently swap the validation and test sets.
train, val, test = read_and_split_data(MALARIA_DIR)
# Report the set sizes in the order: train, test, validation.
print(len(train), len(test), len(val))
## Prepare bounding-box patches with positive and negative classes for training and testing

# The options are defined before the cache check because later cells (network
# construction, detection) read `opts` even when the patches come from disk.
opts = {'img_dir': 'malaria/images/',
        'annotation_dir': 'malaria/annotation/',
        'detection_probability_threshold': 0.5,
        'detection_overlap_threshold': 0.3,
        'gauss': 1,
        'patch_size': (40,40),
        'image_downsample' : 2,
        'detection_step': 5,
        'patch_creation_step': 40,
        'object_class': None,
        'negative_training_discard_rate': .9
       }
# Training patches are sampled with a stride of a quarter of the patch side.
opts['patch_stride_training'] = int(opts['patch_size'][0]*0.25)

# Hoisted out of the cache-miss branch so that `testfiles` is also defined
# when the cached arrays are loaded (the detection cell indexes it).
# NOTE(review): assumes create_sets only lists and partitions the image files
# (cheap, deterministic) — confirm against get_patches.
trainfiles, valfiles, testfiles = get_patches.create_sets(opts['img_dir'], train_set_proportion=.5,
                                                          test_set_proportion=.5,
                                                          val_set_proportion=0)

bounds_dir = 'malaria/bounds/'
cache_paths = {name: bounds_dir + name + '.npy'
               for name in ('train_X', 'train_y', 'test_X', 'test_y')}

# Only trust the cache when all four arrays are present; a partially written
# cache would otherwise fail half-way through loading.
if all(os.path.isfile(path) for path in cache_paths.values()):
    test_y=np.load(cache_paths['test_y'])
    test_X=np.load(cache_paths['test_X'])
    train_X=np.load(cache_paths['train_X'])
    train_y=np.load(cache_paths['train_y'])
else:
    print(len(trainfiles), len(valfiles), len(testfiles))
    print('Creating patches ....')
    train_y, train_X = get_patches.create_patches(trainfiles, opts['annotation_dir'], opts['img_dir'], opts['patch_size'][0],
                                                  opts['patch_stride_training'], grayscale=False, progressbar=True,
                                                  downsample=opts['image_downsample'], objectclass=opts['object_class'],
                                                  negative_discard_rate=opts['negative_training_discard_rate'])
    test_y, test_X = get_patches.create_patches(testfiles,  opts['annotation_dir'], opts['img_dir'], opts['patch_size'][0],
                                                opts['patch_stride_training'], grayscale=False, progressbar=True,
                                                downsample=opts['image_downsample'], objectclass=opts['object_class'],
                                                negative_discard_rate=opts['negative_training_discard_rate'])

    # Cut down on disproportionately large numbers of negative patches
    train_X, train_y = get_patches.balance(train_X, train_y, mult_neg=200)

    # Create rotated and flipped versions of the positive patches
    train_X, train_y = get_patches.augment_positives(train_X, train_y)
    test_X, test_y = get_patches.augment_positives(test_X, test_y)

    # Convert once so this branch leaves NumPy arrays behind, exactly like the
    # cache-hit branch does; later cells index and compare them as arrays.
    train_X, train_y = np.asarray(train_X), np.asarray(train_y)
    test_X, test_y = np.asarray(test_X), np.asarray(test_y)

    # np.save does not create missing directories.
    os.makedirs(bounds_dir, exist_ok=True)
    np.save(cache_paths['train_X'], train_X, allow_pickle=True)
    np.save(cache_paths['train_y'], train_y, allow_pickle=True)
    np.save(cache_paths['test_X'], test_X, allow_pickle=True)
    np.save(cache_paths['test_y'], test_y, allow_pickle=True)

print('\n')
# np.sum runs in native code; the builtin sum would loop over every label in
# Python.
n_pos_train = int(np.sum(train_y))
n_pos_test = int(np.sum(test_y))
n_total = len(train_y) + len(test_y)
# Guard the percentage against an empty data set.
pct_positive = 100.*((n_pos_train + n_pos_test)/n_total) if n_total else 0.
print('%d positive training examples, %d negative training examples' % (n_pos_train, len(train_y)-n_pos_train))
print('%d positive testing examples, %d negative testing examples' % (n_pos_test, len(test_y)-n_pos_test))
print('%d patches (%.1f%% positive)' % (n_total, pct_positive))
187728 positive training examples, 782967 negative training examples
198312 positive testing examples, 775234 negative testing examples
1944241 patches (19.9% positive)
def CNN(n_epochs):
    '''Build an untrained convolutional patch classifier.

    Layer stack: input -> 3x3 convolution (7 filters, ReLU) -> 2x2 max-pool
    -> 2x2 convolution (12 filters, ReLU) -> dense (500 units) -> 2-unit
    softmax output.

    Args:
        n_epochs: Passed to the network as ``max_epochs``.

    Returns:
        The configured ``NeuralNet`` instance; call ``fit`` on it to train.

    NOTE(review): ``NeuralNet``, ``layers`` and ``lasagne`` are not imported
    in the visible part of this file, and the function reads the module-level
    ``opts`` dict, which must be defined before it is called.
    '''
    # Side length of a patch after downsampling. Floor division keeps it an
    # int: under Python 3 ``/`` yields a float (e.g. 20.0), which is not a
    # valid layer dimension.
    patch_side = opts['patch_size'][0] // opts['image_downsample']
    relu = lasagne.nonlinearities.rectify

    net1 = NeuralNet(
        layers=[
            ('input', layers.InputLayer),
            ('conv1', layers.Conv2DLayer),      # convolutional layer; parameters defined below
            ('pool1', layers.MaxPool2DLayer),   # downsamples the feature maps, for execution speed
            ('conv2', layers.Conv2DLayer),
            ('hidden3', layers.DenseLayer),
            ('output', layers.DenseLayer),
        ],

        # (batch, channels, height, width); batch size is left open.
        input_shape=(None, 3, patch_side, patch_side),

        conv1_num_filters=7,
        conv1_filter_size=(3, 3),
        conv1_nonlinearity=relu,

        pool1_pool_size=(2, 2),

        conv2_num_filters=12,
        conv2_filter_size=(2, 2),
        conv2_nonlinearity=relu,

        hidden3_num_units=500,
        output_num_units=2,
        output_nonlinearity=lasagne.nonlinearities.softmax,

        update_learning_rate=0.0001,
        update_momentum=0.9,

        max_epochs=n_epochs,
        verbose=1,
    )
    return net1

# Build a network capped at 50 epochs and train it on the prepared patches.
# NOTE(review): `cnn` holds whatever `fit` returns — presumably the fitted
# network itself; confirm against the NeuralNet implementation in use.
cnn = CNN(50).fit(train_X, train_y)
"""This file is for patches classification (step 1).

Sample predictive model.
You must supply at least 4 methods:
- fit: trains the model.
- predict: uses the model to perform predictions.
- save: saves the model.
- load: reloads the model.
"""
import numpy as np   # We recommend to use numpy arrays
from sklearn.base import BaseEstimator
from resnet import ResNet
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf

class model(BaseEstimator):
    """Patch classifier (step 1) wrapping a ResNet built by ``ResNet.build``.

    Inputs are reshaped to ``PATCH_SHAPE`` images, rescaled by 1/255 and, for
    training only, randomly augmented before being fed to the network.

    NOTE(review): the module docstring asks for ``save`` and ``load`` methods
    as well; they are not implemented here.
    """

    # Shape of one input patch as fed to the network; fit/predict reshape
    # their inputs to (n_samples,) + PATCH_SHAPE.
    PATCH_SHAPE = (40, 40, 3)
    # Number of training epochs. Shared by ``fit`` and ``poly_decay`` so the
    # learning-rate schedule spans exactly the epochs that are trained.
    NUM_EPOCHS = 5
    # Learning rate at epoch 0 of the decay schedule.
    INIT_LR = 1e-1
    # Mini-batch size used for both training and prediction.
    BATCH_SIZE = 32

    def __init__(self):
        """Build and compile the network and reset the bookkeeping fields."""
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False

        # NOTE(review): the network is built for 2 classes but compiled with
        # 'binary_crossentropy' — confirm this matches the label encoding
        # passed to fit().
        self.model=ResNet.build(*self.PATCH_SHAPE, 2, (1, 2),(64, 32, 64, 128), reg=0.0005)
        self.model.compile(loss='binary_crossentropy', optimizer='adam',
                           metrics=['accuracy'])
        self.model.summary()

    def poly_decay(self,epoch):
        """Return the learning rate for ``epoch`` under polynomial decay.

        With ``power == 1`` the rate falls linearly from ``INIT_LR`` at epoch
        0 towards 0 at epoch ``NUM_EPOCHS``.

        Args:
            epoch: Zero-based epoch index supplied by the scheduler callback.

        Returns:
            The learning rate to use for that epoch, as a float.
        """
        power = 1.0
        return self.INIT_LR * (1 - (epoch / float(self.NUM_EPOCHS))) ** power

    def fit(self, X, y):
        """Fit method.

        This function should train the model parameters.
        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
               An image has the following shape (40, 40, 3) then 4800 features.
            y: Training label matrix of dim num_train_samples.
        Both inputs are numpy arrays.

        Returns:
            self, following the scikit-learn estimator convention.
        """
        self.num_train_samples = X.shape[0]
        X = X.reshape((self.num_train_samples,) + self.PATCH_SHAPE)
        # initialize the training data augmentation object
        trainAug = tf.keras.preprocessing.image.ImageDataGenerator(
            rescale=1 / 255.0,
            rotation_range=20,
            zoom_range=0.05,
            width_shift_range=0.05,
            height_shift_range=0.05,
            shear_range=0.05,
            horizontal_flip=True,
            fill_mode="nearest")
        # initialize the training generator
        train_generator = trainAug.flow(
            X,
            y,
            shuffle=True,
            batch_size=self.BATCH_SIZE)
        callbacks = [LearningRateScheduler(self.poly_decay)]

        # Model.fit accepts generators directly; fit_generator is deprecated.
        # len(train_generator) is the number of batches per pass including the
        # final partial one, so small data sets no longer yield zero steps.
        self.model.fit(train_generator,
            steps_per_epoch=len(train_generator),
            epochs=self.NUM_EPOCHS, callbacks=callbacks)
        self.is_trained = True
        return self

    def predict(self, X):
        """Run the network on ``X`` and return its raw outputs.

        Args:
            X: Numpy array holding num_test_samples patches; it is reshaped to
               (num_test_samples, 40, 40, 3).

        Returns:
            Whatever the underlying Keras model predicts for the rescaled
            patches, in input order (the generator is not shuffled). These
            are model outputs, not thresholded class labels.
        """
        num_test_samples = X.shape[0]
        X = X.reshape((num_test_samples,) + self.PATCH_SHAPE)
        # initialize the validation (and testing) data augmentation object;
        # only the rescaling used at training time is applied.
        testAug = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1 / 255.0)
        # initialize the testing generator
        testGen = testAug.flow(
            X,
            shuffle=False,
            batch_size=self.BATCH_SIZE)
        # Model.predict accepts generators directly; predict_generator is
        # deprecated.
        return self.model.predict(testGen)

# reload(det)
# pyplot is needed in this cell; imported here because the file's other pyplot
# import only appears further down.
import matplotlib.pyplot as plt

# Pick one image from the test files and run the detector on it with a
# stricter probability threshold than the one initially stored in `opts`.
fname = testfiles[1]
imfile = opts['img_dir'] + fname
opts['detection_probability_threshold'] = 0.95

found = det.detect(imfile, cnn, opts)

# scipy.misc.imread has been removed from SciPy; skimage.io (already imported
# by this file) is used instead. np.array gives a plain, writable copy for
# cv2 to draw on.
im = np.array(io.imread(imfile))

plt.box(False)
plt.xticks([])
plt.yticks([])

# The annotation file shares the image's base name; splitext copes with
# extensions of any length (e.g. '.jpeg'), unlike slicing off 3 characters.
annofile = opts['annotation_dir'] + os.path.splitext(fname)[0] + '.xml'
# NOTE(review): `readdata` is not imported in the visible part of this file.
bboxes = readdata.get_bounding_boxes_for_single_image(annofile)
# Annotated boxes are drawn in white.
# NOTE(review): annotation boxes are indexed as corners (0,2)-(1,3) while
# detections are indexed as (0,1)-(2,3); presumably the two sources use
# different coordinate orders — confirm.
for bb in bboxes:
    # Plain Python ints avoid relying on OpenCV accepting NumPy scalar types.
    bb = [int(v) for v in bb]
    cv2.rectangle(im, (bb[0],bb[2]), (bb[1],bb[3]), (255,255,255), 2)

# Detections are drawn with colour (255,0,0).
for f in found:
    f = [int(v) for v in f]
    cv2.rectangle(im, (f[0],f[1]), (f[2],f[3]), (255,0,0), 2)

plt.gcf().set_size_inches(10,10)
plt.title('Detected objects in %s' % (imfile))
plt.imshow(im)

# Optional: persist the annotated image.
#cv2.imwrite('detectionimages/detected-' + os.path.basename(imfile),im)
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-14-4b406515f3b6> in <module>
      5 opts['detection_probability_threshold'] = 0.95
      6 
----> 7 found = det.detect(imfile, cnn, opts)
      8 
      9 im = misc.imread(imfile)


NameError: name 'cnn' is not defined
import matplotlib.pyplot as plt ## for plotting data
import seaborn as sns ## another library to visualize data features

# Show a 2 x N grid of randomly chosen training patches: positives in the top
# row, negatives in the bottom row.
N_samples_to_display = 10

# Work on an array view of the labels so the `== 0` comparison is element-wise
# even if train_y is a plain list.
labels = np.asarray(train_y)

pos_indices = np.where(labels)[0]
pos_indices = pos_indices[np.random.permutation(len(pos_indices))]
neg_indices = np.where(labels == 0)[0]
neg_indices = neg_indices[np.random.permutation(len(neg_indices))]

for row, class_indices in enumerate((pos_indices, neg_indices)):
    # Slicing (rather than indexing up to a fixed count) shows fewer patches
    # when a class has less than N_samples_to_display examples instead of
    # raising IndexError; each row draws from the start of its own shuffled
    # index list.
    for col, patch_idx in enumerate(class_indices[:N_samples_to_display]):
        plt.subplot(2, N_samples_to_display, row*N_samples_to_display + col + 1)
        # Patches are stored channel-first (the network input shape is
        # (batch, 3, H, W)); imshow wants channel-last. moveaxis keeps rows
        # and columns in place, whereas swapaxes(0, 2) would also transpose
        # the image.
        patch = np.moveaxis(np.asarray(train_X[patch_idx]), 0, -1)
        plt.axis('off')
        plt.imshow(patch)

plt.gcf().set_size_inches(1.5*N_samples_to_display,3)

png

In the plot above, the first 10 samples are positively labeled patches, i.e., infected cells, and the second 10 samples are negatively labeled patches, i.e., non-infected boxes randomly selected from the blood-smear images that do not overlap any expert-annotated bounding box.