Imports¶

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score, accuracy_score, recall_score, precision_score, classification_report
from sklearn.pipeline import make_pipeline

Load Data¶

# load data
data = pd.read_csv('../data/ALL_DATA.csv')
print(data.columns)


print(data.head())
print(np.shape(data))

print('\n--------\n')

# Turn into Dataframe
print("Converted to Dataframe: \n")
all_data = pd.DataFrame(data=data)
print(all_data)

Index(['id', 'class', 'mass', 'fall', 'year', 'lat', 'long', 'Elevation'], dtype='object')
      id        class      mass  fall    year       lat       long   Elevation
0    1.0           L5      21.0  Fell  1880.0  50.77500    6.08333  333.548386
1    2.0           H6     720.0  Fell  1951.0  56.18333   10.23333  333.548386
2    6.0          EH4  107000.0  Fell  1952.0  54.21667 -113.00000  333.548386
3   10.0  Acapulcoite    1914.0  Fell  1976.0  16.88333  -99.90000  333.548386
4  370.0           L6     780.0  Fell  1902.0 -33.16667  -64.95000  333.548386
(88705, 8)

--------

Converted to Dataframe: 

            id        class           mass  fall    year        lat  \
0          1.0           L5      21.000000  Fell  1880.0  50.775000   
1          2.0           H6     720.000000  Fell  1951.0  56.183330   
2          6.0          EH4  107000.000000  Fell  1952.0  54.216670   
3         10.0  Acapulcoite    1914.000000  Fell  1976.0  16.883330   
4        370.0           L6     780.000000  Fell  1902.0 -33.166670   
...        ...          ...            ...   ...     ...        ...   
88700  75686.0          NaN   13278.078549   NaN  2005.0 -38.000000   
88701  75687.0          NaN   13278.078549   NaN  2005.0 -38.000000   
88702  75691.0          NaN   13278.078549   NaN  2005.0 -36.816110   
88703  75692.0          NaN   13278.078549   NaN  2005.0 -36.816110   
88704  78769.0          NaN   13278.078549   NaN  2005.0  53.033333   

             long   Elevation  
0        6.083330  333.548386  
1       10.233330  333.548386  
2     -113.000000  333.548386  
3      -99.900000  333.548386  
4      -64.950000  333.548386  
...           ...         ...  
88700  145.000000  100.000000  
88701  145.000000  100.000000  
88702  144.680991  233.000000  
88703  144.680991  233.000000  
88704   20.716667  150.000000  

[88705 rows x 8 columns]

Scaling (10%)¶

# *****************************
# HIGHLY IMPORTANT
# *****************************

# Sample data
print("Original Data Stats: \n")
print(all_data.describe())

print('\n--------\n')

print("New Sample Data Stats: \n")
all_data = all_data.sample(frac=0.1)  # 10% sample set
print(all_data.describe())

Original Data Stats: 

                 id          mass          year           lat          long  \
count  88705.000000  8.870500e+04  88705.000000  88705.000000  88705.000000   
mean   37088.348053  1.327808e+04   1999.326928      2.815932     38.871848   
std    23580.335315  4.121868e+05     21.440582     50.503054     70.303823   
min        1.000000  0.000000e+00    301.000000    -87.366670   -174.833333   
25%    16281.000000  3.000000e+01   1998.000000    -38.630000      8.192500   
50%    34337.000000  1.200000e+04   2002.000000     23.000000     26.000000   
75%    56576.000000  1.327808e+04   2008.000000     45.968889     58.407850   
max    80694.000000  6.000000e+07   2501.000000     82.569167    354.473330   

          Elevation  
count  88705.000000  
mean     333.548386  
std      375.385323  
min      -11.000000  
25%      100.000000  
50%      333.548386  
75%      333.548386  
max     9999.000000  

--------

New Sample Data Stats: 

                 id          mass         year          lat         long  \
count   8870.000000  8.870000e+03  8870.000000  8870.000000  8870.000000   
mean   37021.583766  8.026057e+03  1999.107155     2.557869    39.658546   
std    23575.968913  3.562619e+04    26.576604    51.054846    69.942534   
min        4.000000  0.000000e+00   301.000000   -86.716670  -165.116670   
25%    16487.000000  2.760000e+01  1998.000000   -71.500000     8.649167   
50%    33808.500000  9.827500e+03  2002.000000    23.254845    26.000000   
75%    56581.750000  1.327808e+04  2008.000000    46.511111    59.683333   
max    80674.000000  2.753000e+06  2101.000000    73.883333   178.050000   

         Elevation  
count  8870.000000  
mean    327.935051  
std     357.428963  
min      -3.000000  
25%     100.000000  
50%     333.548386  
75%     333.548386  
max    3865.000000

Target and Feature Variables¶

# target variable
target = data.year
# features
features = data.drop(['year'], axis=1)


# print(data.head())
features.columns

Index(['id', 'class', 'mass', 'fall', 'lat', 'long', 'Elevation'], dtype='object')

Encoding (One-Hot)¶

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# ### Categorical data to be converted to numeric data
# class_data = list(all_data['class'])
# fall_data = list(all_data['fall'])

class_data = list(features['class'])
fall_data = list(features['fall'])

### integer mapping using LabelEncoder
le = LabelEncoder()
class_encoded = le.fit_transform(class_data)
fall_encoded = le.fit_transform(fall_data)
class_encoded = class_encoded.reshape(len(class_encoded), 1)
fall_encoded = fall_encoded.reshape(len(fall_encoded), 1)

### One hot encoding
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded_class = onehot_encoder.fit_transform(class_encoded)
onehot_encoded_fall = onehot_encoder.fit_transform(fall_encoded)

# print(onehot_encoded_class)
# all_data['class'] = onehot_encoded_class
# print(all_data['class'])

# print('\n\n\n')
# print(onehot_encoded_fall)
# all_data['fall'] = onehot_encoded_fall
# print(all_data['fall'])

print(onehot_encoded_class)
features['class'] = onehot_encoded_class
print(features['class'])

print('\n\n\n')
print(onehot_encoded_fall)
features['fall'] = onehot_encoded_fall
print(features['fall'])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
0        0.0
1        0.0
2        0.0
3        1.0
4        0.0
        ... 
88700    0.0
88701    0.0
88702    0.0
88703    0.0
88704    0.0
Name: class, Length: 88705, dtype: float64


[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]
0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
88700    0.0
88701    0.0
88702    0.0
88703    0.0
88704    0.0
Name: fall, Length: 88705, dtype: float64

Build Model¶

def data_model(data, target):  #x,y
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25, random_state=0)
    pipeline = make_pipeline(LogisticRegression())
    model = pipeline.fit(X_train, y_train)
    
    return (X_test, y_test, model)

# print("Value distribution of features: ")
# print(list(all_data.iloc[0]))

# min_max = MinMaxScaler()
# data_min_max = min_max.fit_transform(all_data)
# print('\n')
# print("Value distribution after min max: ")
# print(list(data_min_max[0]))

# std = StandardScaler()
# data_std = std.fit_transform(all_data)
# print('\n')
# print("Value distribution after std: ")
# print(list(data_std[0]))

print("Value distribution of features: ")
print(list(features.iloc[0]))

min_max = MinMaxScaler()
data_min_max = min_max.fit_transform(features)
print('\n')
print("Value distribution after min max: ")
print(list(data_min_max[0]))

std = StandardScaler()
data_std = std.fit_transform(features)
print('\n')
print("Value distribution after std: ")
print(list(data_std[0]))

Value distribution of features: 
[1.0, 0.0, 21.0, 1.0, 50.775, 6.08333, 333.54838634618517]


Value distribution after min max: 
[0.0, 0.0, 3.5e-07, 1.0, 0.8129048746259518, 0.3417993308340014, 0.03442041821640212]


Value distribution after std: 
[-1.5728172058230052, -0.024680562891811992, -0.032162975554411624, 8.895559827418387, 0.9496324358654531, -0.466385781116156, 0.0]

Model Evaluation¶

#Data Variable
# x = all_data[all_data.columns[0]]
x = features
print(x)
#Target Variable
# y = all_data[all_data.columns[1]]
y = target
print(y)

            id  class           mass  fall        lat        long   Elevation
0          1.0    0.0      21.000000   1.0  50.775000    6.083330  333.548386
1          2.0    0.0     720.000000   1.0  56.183330   10.233330  333.548386
2          6.0    0.0  107000.000000   1.0  54.216670 -113.000000  333.548386
3         10.0    1.0    1914.000000   1.0  16.883330  -99.900000  333.548386
4        370.0    0.0     780.000000   1.0 -33.166670  -64.950000  333.548386
...        ...    ...            ...   ...        ...         ...         ...
88700  75686.0    0.0   13278.078549   0.0 -38.000000  145.000000  100.000000
88701  75687.0    0.0   13278.078549   0.0 -38.000000  145.000000  100.000000
88702  75691.0    0.0   13278.078549   0.0 -36.816110  144.680991  233.000000
88703  75692.0    0.0   13278.078549   0.0 -36.816110  144.680991  233.000000
88704  78769.0    0.0   13278.078549   0.0  53.033333   20.716667  150.000000

[88705 rows x 7 columns]
0        1880.0
1        1951.0
2        1952.0
3        1976.0
4        1902.0
          ...  
88700    2005.0
88701    2005.0
88702    2005.0
88703    2005.0
88704    2005.0
Name: year, Length: 88705, dtype: float64

# print("Base:")
# X_test, y_test, model = data_model(x, y)
# prediction = model.predict(X_test)
# print("MSE: {}".format(mean_squared_error(y_test, prediction)))
# print("R Squared: {}".format(r2_score(y_test, prediction)))
# print('\n')


print("MinMax:")
X_test, y_test, model = data_model(data_min_max, y)
prediction = model.predict(X_test)
print("MSE: {}".format(mean_squared_error(y_test, prediction)))
print("R Squared: {}".format(r2_score(y_test, prediction)))
print("Confusion Matrix: {}".format(confusion_matrix(y_test, prediction.round())))
print("Accuracy: {}".format(accuracy_score(y_test, prediction.round(), normalize=False)))
print("Recall Score: {}".format(recall_score(y_test, prediction.round(), average=None)))
print("Precision Score: {}".format(precision_score(y_test, prediction.round(), average=None)))
print("Classification Report: {}".format(classification_report(y_test, prediction.round())))
print('\n')



print("Std:")
X_test, y_test, model = data_model(data_std, y)
prediction = model.predict(X_test)
print("MSE: {}".format(mean_squared_error(y_test, prediction)))
print("R Squared: {}".format(r2_score(y_test, prediction)))
print("Confusion Matrix: {}".format(confusion_matrix(y_test, prediction.round())))
print("Accuracy: {}".format(accuracy_score(y_test, prediction.round(), normalize=False)))
print("Recall Score: {}".format(recall_score(y_test, prediction.round(), average=None)))
print("Precision Score: {}".format(precision_score(y_test, prediction.round(), average=None)))
print("Classification Report: {}".format(classification_report(y_test, prediction.round())))

MinMax:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-35-1d22f10f8f03> in <module>
      8 
      9 print("MinMax:")
---> 10 X_test, y_test, model = data_model(data_min_max, y)
     11 prediction = model.predict(X_test)
     12 print("MSE: {}".format(mean_squared_error(y_test, prediction)))

<ipython-input-17-b92d6064f633> in data_model(data, target)
      2     X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25, random_state=0)
      3     pipeline = make_pipeline(LogisticRegression())
----> 4     model = pipeline.fit(X_train, y_train)
      5 
      6     return (X_test, y_test, model)

/usr/local/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    354                                  self._log_message(len(self.steps) - 1)):
    355             if self._final_estimator != 'passthrough':
--> 356                 self._final_estimator.fit(Xt, y, **fit_params)
    357         return self
    358 

/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py in fit(self, X, y, sample_weight)
   1531         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
   1532                          accept_large_sparse=solver != 'liblinear')
-> 1533         check_classification_targets(y)
   1534         self.classes_ = np.unique(y)
   1535         n_samples, n_features = X.shape

/usr/local/lib/python3.7/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
    167     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    168                       'multilabel-indicator', 'multilabel-sequences']:
--> 169         raise ValueError("Unknown label type: %r" % y_type)
    170 
    171 

ValueError: Unknown label type: 'continuous'

Stop here as this model is meant for classification not regression¶