import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score, accuracy_score, recall_score, precision_score, classification_report
from sklearn.pipeline import make_pipeline
# Load the NASA data set and show its columns, first rows and dimensions.
data = pd.read_csv('../data/ALL_DATA.csv')
print(data.columns)
print(data.head())
print(data.shape)

# *****************************
# HIGHLY IMPORTANT
# *****************************
# Everything below runs on a 10% random sample of the file, not on the full
# data set. The sample is seeded so the printed statistics and every metric
# derived from it are reproducible from run to run (the train/test split in
# data_model is seeded as well).
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 0

print("Original Data Stats: \n")
print(data.describe())
print('\n--------\n')
print("New Sample Data Stats: \n")
data = data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
print(data.describe())

# Separate frame over the sampled rows; used later for the correlation
# heat map, which needs the 'year' column alongside the other columns.
all_data = pd.DataFrame(data=data)
print(all_data)

# Target variable: the 'year' column of the sample.
target = data['year']
# Features: every remaining column of the sample.
features = data.drop(columns=['year'])
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Convert the categorical 'class' and 'fall' columns to numeric data.
#
# Each category becomes its own 0/1 indicator column. A one-hot matrix has
# one column per category, so it cannot be stored back into the single
# original column (pandas raises ValueError for a 2-D value assigned to one
# column); the indicator columns are appended to `features` instead and the
# original text columns are dropped.
class_dummies = pd.get_dummies(features['class'], prefix='class', dtype=float)
fall_dummies = pd.get_dummies(features['fall'], prefix='fall', dtype=float)

# Plain ndarray views of the one-hot matrices, kept under their old names.
onehot_encoded_class = class_dummies.to_numpy()
onehot_encoded_fall = fall_dummies.to_numpy()

print(onehot_encoded_class)
print(class_dummies)
print('\n\n\n')
print(onehot_encoded_fall)
print(fall_dummies)

# Swap the text columns for their indicator columns; row order and index are
# unchanged, so `features` stays aligned with `target`.
features = pd.concat(
    [features.drop(columns=['class', 'fall']), class_dummies, fall_dummies],
    axis=1,
)
print(features.head())
# Bar chart of how many sampled rows there are per target (year) value.
count = target.value_counts()
plt.bar(count.keys(), count.values)

# Heat map of the pairwise correlations between columns.
# Only numeric/boolean columns are correlated: `all_data` still holds the
# text columns ('class', 'fall'), and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it.
correlation_matrix = all_data.select_dtypes(include=['number', 'bool']).corr()
fig = plt.figure(figsize=(9, 6))
sns.heatmap(correlation_matrix, vmax=0.8, square=True, annot=True)
plt.show()

# Absolute correlation of every column with the output variable.
cor_target = correlation_matrix['year'].abs()
print(cor_target)

# Columns whose absolute correlation with 'year' exceeds 0.2. Printed
# explicitly: a bare expression shows nothing when run as a script.
relevant_features = cor_target[cor_target > 0.2]
print(relevant_features)
def data_model(data, target, *, test_size=0.25, random_state=0):
    """Split the data, fit a linear regression, and return the held-out part.

    Args:
        data: Feature matrix (x) passed to ``train_test_split``.
        target: Target values (y) aligned with ``data``.
        test_size: Share of the rows held out for testing. Defaults to the
            previously hard-coded 0.25.
        random_state: Seed for the split. Defaults to the previously
            hard-coded 0, so existing calls produce the same split.

    Returns:
        A tuple ``(X_test, y_test, model)`` where ``model`` is the pipeline
        fitted on the training portion only.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, random_state=random_state
    )
    # Single-step pipeline; fit() returns the fitted pipeline itself.
    model = make_pipeline(LinearRegression()).fit(X_train, y_train)
    return (X_test, y_test, model)
# Show the first feature row as-is, then after each scaling scheme.
print("Value distribution of features: ")
first_row = features.iloc[0]
print(list(first_row))

# Min-max scaling, fitted on the full feature table.
min_max = MinMaxScaler()
data_min_max = min_max.fit_transform(features)
print('\n', "Value distribution after min max: ", sep='\n')
print(list(data_min_max[0]))

# Standardization, fitted on the full feature table.
std = StandardScaler()
data_std = std.fit_transform(features)
print('\n', "Value distribution after std: ", sep='\n')
print(list(data_std[0]))

# Data variable (x) and target variable (y) used for the model runs.
x, y = features, target
print(x)
print(y)
# Fit and score the linear model on the raw, min-max-scaled and standardized
# features in turn, reporting MSE and R squared on the held-out rows.
# NOTE: classification metrics on rounded predictions (confusion matrix,
# accuracy, recall, precision, classification report) are intentionally not
# reported; only the regression metrics are printed.
evaluation_runs = (
    ("Base:", x),
    ("MinMax:", data_min_max),
    ("Std:", data_std),
)

for run_index, (heading, inputs) in enumerate(evaluation_runs):
    # Blank separator between consecutive runs, none before the first.
    if run_index:
        print('\n')
    print(heading)
    X_test, y_test, model = data_model(inputs, y)
    prediction = model.predict(X_test)
    print("MSE: {}".format(mean_squared_error(y_test, prediction)))
    print("R Squared: {}".format(r2_score(y_test, prediction)))