Imports

In [31]:
# Core numerics, plotting, and stats
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

# scikit-learn: splitting, models, preprocessing, and metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score, accuracy_score, recall_score, precision_score, classification_report
from sklearn.pipeline import make_pipeline

# Keras building blocks for the neural network
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GlobalMaxPooling1D, SpatialDropout1D

Load Data

In [5]:
# Load the NASA meteorite landings dataset
data = pd.read_csv('../data/ALL_DATA.csv')
print(data.columns)
print(data.head())
print(np.shape(data))
Index(['id', 'class', 'mass', 'fall', 'year', 'lat', 'long', 'Elevation'], dtype='object')
      id        class      mass  fall    year       lat       long   Elevation
0    1.0           L5      21.0  Fell  1880.0  50.77500    6.08333  333.548386
1    2.0           H6     720.0  Fell  1951.0  56.18333   10.23333  333.548386
2    6.0          EH4  107000.0  Fell  1952.0  54.21667 -113.00000  333.548386
3   10.0  Acapulcoite    1914.0  Fell  1976.0  16.88333  -99.90000  333.548386
4  370.0           L6     780.0  Fell  1902.0 -33.16667  -64.95000  333.548386
(88705, 8)
/Users/aleia/Library/Python/3.7/lib/python/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (1,3) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
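
The DtypeWarning above comes from pandas inferring dtypes chunk by chunk; columns 1 and 3 here are 'class' and 'fall'. Following the warning's own suggestion, an un-run sketch of two ways to silence it:

In [ ]:
# Option 1: read the whole file at once so dtype inference sees every row
data = pd.read_csv('../data/ALL_DATA.csv', low_memory=False)

# Option 2: declare the mixed-type columns explicitly
data = pd.read_csv('../data/ALL_DATA.csv', dtype={'class': str, 'fall': str})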

Sampling (10%)

In [6]:
# *****************************
# HIGHLY IMPORTANT
# *****************************

# Down-sample to 10% of the rows so later steps run quickly
print("Original Data Stats: \n")
print(data.describe())

print('\n--------\n')

print("New Sample Data Stats: \n")

data = data.sample(frac=0.1)  # 10% sample set; drawn fresh on every run
print(data.describe())
print(data.describe())
Original Data Stats: 

                 id          mass          year           lat          long  \
count  88705.000000  8.870500e+04  88705.000000  88705.000000  88705.000000   
mean   37088.348053  1.327808e+04   1999.326928      2.815932     38.871848   
std    23580.335315  4.121868e+05     21.440582     50.503054     70.303823   
min        1.000000  0.000000e+00    301.000000    -87.366670   -174.833333   
25%    16281.000000  3.000000e+01   1998.000000    -38.630000      8.192500   
50%    34337.000000  1.200000e+04   2002.000000     23.000000     26.000000   
75%    56576.000000  1.327808e+04   2008.000000     45.968889     58.407850   
max    80694.000000  6.000000e+07   2501.000000     82.569167    354.473330   

          Elevation  
count  88705.000000  
mean     333.548386  
std      375.385323  
min      -11.000000  
25%      100.000000  
50%      333.548386  
75%      333.548386  
max     9999.000000  

--------

New Sample Data Stats: 

                 id          mass         year          lat         long  \
count   8870.000000  8.870000e+03  8870.000000  8870.000000  8870.000000   
mean   36754.965614  1.434879e+04  1999.313735     2.752044    39.549402   
std    23464.883540  3.811169e+05    19.264951    50.735250    70.203357   
min       11.000000  0.000000e+00  1621.000000   -87.366670  -165.116670   
25%    16230.000000  2.874500e+01  1998.000000   -71.500000     8.662325   
50%    33647.000000  1.000000e+04  2002.000000    23.361920    26.000000   
75%    56099.000000  1.327808e+04  2008.000000    46.260556    73.333333   
max    80692.000000  2.600000e+07  2020.000000    82.569167   178.050000   

         Elevation  
count  8870.000000  
mean    326.449900  
std     368.702903  
min      -3.000000  
25%     100.000000  
50%     333.548386  
75%     333.548386  
max    9999.000000  
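
Because sample() draws a fresh random subset each run, the sample statistics above will shift every time the notebook is executed. A sketch of a reproducible variant (the seed value 42 is an arbitrary choice):

In [ ]:
# Fix the RNG seed so the 10% sample (and everything downstream) is repeatable
data = data.sample(frac=0.1, random_state=42)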
In [7]:
all_data = data.copy()  # data is already a DataFrame; keep a copy under the old name
print(all_data)

# Target variable: the year of the fall/find
target = data.year
# Features: everything except the target
features = data.drop(['year'], axis=1)

features.columns
            id      class          mass   fall    year        lat        long  \
44557  38928.0         H6      0.960000  Found  1998.0   0.000000    0.000000   
86513  66287.0        NaN  13278.078549    NaN  2011.0  41.854444   14.074167   
77675  60327.0        NaN  13278.078549    NaN  2007.0  30.469444  119.594722   
72203  56112.0        NaN  13278.078549    NaN  2002.0  47.650000   18.300000   
15263  50394.0         L5      4.100000  Found  2006.0 -72.780830   75.315560   
...        ...        ...           ...    ...     ...        ...         ...   
31048  44836.0  Relict OC  13278.078549  Found  1996.0  58.583330   13.433330   
88320  41812.0        NaN  13278.078549    NaN  2005.0  52.459444   12.963889   
39048  25008.0         H6      3.970000  Found  1974.0   0.000000   35.666670   
13794  49818.0         H5      0.450000  Found  2003.0 -72.983610   75.245830   
79761  54881.0        NaN  13278.078549    NaN  2006.0  41.500000   25.166667   

         Elevation  
44557   333.548386  
86513  1227.000000  
77675     0.000000  
72203   120.000000  
15263   333.548386  
...            ...  
31048   333.548386  
88320     0.000000  
39048   333.548386  
13794   333.548386  
79761   950.000000  

[8870 rows x 8 columns]
Out[7]:
Index(['id', 'class', 'mass', 'fall', 'lat', 'long', 'Elevation'], dtype='object')
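
The frame printed above shows NaN in both class and fall, plus a repeated mass value of 13278.078549 that matches the column mean from the earlier describe() output, which suggests missing masses were mean-imputed upstream. Before encoding, it is worth quantifying the gaps; a minimal check:

In [ ]:
# Count missing values per column before encoding the categoricals
print(features.isna().sum())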

One Hot Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to be converted to numeric data
class_data = list(features['class'])
fall_data = list(features['fall'])


### Integer mapping using LabelEncoder
le = LabelEncoder()
class_encoded = le.fit_transform(class_data)
fall_encoded = le.fit_transform(fall_data)
class_encoded = class_encoded.reshape(len(class_encoded), 1)
fall_encoded = fall_encoded.reshape(len(fall_encoded), 1)

### One-hot encoding of the integer codes
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded_class = onehot_encoder.fit_transform(class_encoded)
onehot_encoded_fall = onehot_encoder.fit_transform(fall_encoded)

print(onehot_encoded_class)
# NOTE: assigning a 2-D one-hot matrix to a single DataFrame column keeps only
# its first column, which is why 'class' prints as all zeros below -- see the
# corrected approach after this cell's output.
features['class'] = onehot_encoded_class
print(features['class'])

print('\n\n\n')
print(onehot_encoded_fall)
# Same problem here: 'fall' collapses to its first one-hot column
features['fall'] = onehot_encoded_fall
print(features['fall'])
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
44557    0.0
86513    0.0
77675    0.0
72203    0.0
15263    0.0
        ... 
31048    0.0
88320    0.0
39048    0.0
13794    0.0
79761    0.0
Name: class, Length: 8870, dtype: float64




[[0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
44557    0.0
86513    0.0
77675    0.0
72203    0.0
15263    0.0
        ... 
31048    0.0
88320    0.0
39048    0.0
13794    0.0
79761    0.0
Name: fall, Length: 8870, dtype: float64
/usr/local/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.
If you want the future behaviour and silence this warning, you can specify "categories='auto'".
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  warnings.warn(msg, FutureWarning)
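
As the all-zero class and fall printouts above show, only the first one-hot column survived each assignment. A minimal corrected sketch using pandas' get_dummies, which keeps every indicator column (and, via dummy_na, one for the NaN entries) and also sidesteps the LabelEncoder-then-OneHotEncoder FutureWarning:

In [ ]:
# Expand 'class' and 'fall' into one indicator column per category;
# dummy_na=True adds an explicit indicator for the missing values
features = pd.get_dummies(features, columns=['class', 'fall'], dummy_na=True)
print(features.shape)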
In [9]:
# Inspect the encoded feature matrix
print(features)
            id  class          mass  fall        lat        long    Elevation
44557  38928.0    0.0      0.960000   0.0   0.000000    0.000000   333.548386
86513  66287.0    0.0  13278.078549   0.0  41.854444   14.074167  1227.000000
77675  60327.0    0.0  13278.078549   0.0  30.469444  119.594722     0.000000
72203  56112.0    0.0  13278.078549   0.0  47.650000   18.300000   120.000000
15263  50394.0    0.0      4.100000   0.0 -72.780830   75.315560   333.548386
...        ...    ...           ...   ...        ...         ...          ...
31048  44836.0    0.0  13278.078549   0.0  58.583330   13.433330   333.548386
88320  41812.0    0.0  13278.078549   0.0  52.459444   12.963889     0.000000
39048  25008.0    0.0      3.970000   0.0   0.000000   35.666670   333.548386
13794  49818.0    0.0      0.450000   0.0 -72.983610   75.245830   333.548386
79761  54881.0    0.0  13278.078549   0.0  41.500000   25.166667   950.000000

[8870 rows x 7 columns]

Scaling

In [10]:
print("Value distribution of features: ")
print(list(features.iloc[0]))

min_max = MinMaxScaler(feature_range = (0, 1))
data_min_max = min_max.fit_transform(features)
print('\n')
print("Value distribution after min max: ")
print(list(data_min_max[0]))
Value distribution of features: 
[38928.0, 0.0, 0.96, 0.0, 0.0, 0.0, 333.54838634618517]


Value distribution after min max: 
[0.4823564407977095, 0.0, 3.692307692307692e-08, 0.0, 0.5141156316036783, 0.4811559059625458, 0.03364810901281595]
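
Note that data_min_max is a plain NumPy array and is never referenced again: the model below is fit on the unscaled features. If the scaled values are meant to feed the network, a sketch of carrying them forward with the original column names and index:

In [ ]:
# Keep the scaled values in DataFrame form so they can replace `features` downstream
features_scaled = pd.DataFrame(data_min_max, columns=features.columns, index=features.index)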

Bar Plot of year

In [12]:
plt.figure(figsize=(10, 5))
plt.title('Year')
count = target.value_counts()
print(count)
plt.plot(count)  # NOTE: this draws a line plot, not a bar plot; see the sketch below
2000.0    712
1999.0    685
2003.0    516
2001.0    476
2006.0    403
         ... 
1832.0      1
1882.0      1
1840.0      1
1819.0      1
1834.0      1
Name: year, Length: 158, dtype: int64
Out[12]:
[<matplotlib.lines.Line2D at 0x111b75f90>]
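
Despite the section title, plt.plot traces a line through the year counts in descending-frequency order. A sketch of an actual bar chart with the years in chronological order:

In [ ]:
# Sort counts by year so the x-axis is chronological, then draw bars
plt.figure(figsize=(10, 5))
plt.title('Meteorite counts per year')
target.value_counts().sort_index().plot(kind='bar')
plt.show()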

Build Model

In [27]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)
model = Sequential()
# NOTE: LSTM layers expect 3-D input (samples, timesteps, features), but no
# input_shape is declared and X_train is 2-D; this is what raises the
# ValueError during fit below -- see the reshaping sketch after the traceback.
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))

Compile Model

In [28]:
model.compile(optimizer='adam', loss='mean_squared_error')

Train Model

In [29]:
model.fit(X_train, y_train, batch_size=1, epochs=1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-29-8e7061fb8359> in <module>
----> 1 model.fit(X_train, y_train, batch_size=1, epochs=1)

... (TensorFlow-internal stack frames omitted) ...

ValueError: in user code:

    ...
    /usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:180 assert_input_compatibility
        str(x.shape.as_list()))

    ValueError: Input 0 of layer sequential_3 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [1, 7]
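
The final error is the key line: the LSTM stack expects 3-D input (samples, timesteps, features) but received 2-D rows of shape [1, 7]. A minimal sketch of one way forward, treating each record as a length-1 sequence and declaring the input shape up front (whether an LSTM is a sensible choice for non-sequential tabular data is a separate question; a plain Dense network would fit this shape just as well):

In [ ]:
# Reshape the 2-D feature matrix to (samples, timesteps=1, features)
# to match the 3-D input the LSTM layers expect
n_features = X_train.shape[1]
X_train_3d = X_train.values.reshape(-1, 1, n_features)
X_test_3d = X_test.values.reshape(-1, 1, n_features)

model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(1, n_features)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_3d, y_train, batch_size=32, epochs=1)  # batch_size=32 is an arbitrary choice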

Model Evaluation

In [ ]: