Project description

# Imports (note: StandardScaler lives in sklearn.preprocessing, not sklearn.discriminant_analysis)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from apyori import apriori

df = pd.read_csv("housing.csv")
print(df)
print(df.head())
print(df.tail())
print(df.max())
print(df.min())

median_house_value_column = df['median_house_value']

# Summary statistics for the median_house_value column
print(median_house_value_column.var())
print(median_house_value_column.median())
print(median_house_value_column.mode())
print(median_house_value_column.mean())
print(median_house_value_column.std())
print(median_house_value_column.count())
print(median_house_value_column.describe())
print(median_house_value_column.dtypes)

# Drop rows with missing values
df = df.dropna()
print(df)

# Unit-norm scaling of a single column: on a (n, 1) array the default axis=1
# would turn every non-zero value into 1.0, so normalize along axis=0 instead
x_array = np.array(df['median_house_value']).reshape(-1, 1)
normalized_arr = preprocessing.normalize(x_array, axis=0)
print(normalized_arr)

numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = list(set(df.columns) - set(numeric_columns))
df_encoded = pd.get_dummies(df, columns=categorical_columns)
scaler = MinMaxScaler(feature_range=(0, 2))
df_encoded[numeric_columns] = scaler.fit_transform(df_encoded[numeric_columns])
print(df_encoded.head())
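A quick sanity check (a minimal sketch using the df_encoded frame from above) is to confirm the numeric columns now span the chosen (0, 2) range:

# Min/max of each scaled numeric column should be approximately 0 and 2
print(df_encoded[numeric_columns].agg(['min', 'max']))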

pca = PCA()
pca.fit(df_encoded)
print(pca.explained_variance_ratio_)
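To decide how many components to keep, a common follow-up (a sketch reusing the fitted pca above; the 0.95 threshold is an arbitrary example, not from the original script) is to look at the cumulative explained variance:

# Cumulative explained variance; pick the smallest k that reaches the threshold
cumulative = np.cumsum(pca.explained_variance_ratio_)
n_components = int(np.argmax(cumulative >= 0.95)) + 1
print(cumulative)
print("Components for 95% of variance:", n_components)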

df1 = pd.read_csv("student.csv", header=0)
df2 = pd.read_csv("mark.csv", header=0)
df_stu = pd.merge(df1, df2, on='Student_id')

print(df_stu.head())
print(df_stu.shape)

store_data = pd.read_csv("store_data.csv", header=None)
print(store_data.head())
records = []

for i in range(0, 7501):
    records.append([str(store_data.values[i, j]) for j in range(0, 20)])
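Because shorter transactions are padded with NaN, str() turns those cells into the literal string 'nan', which then shows up inside the mined rules. One way around that (a sketch, not part of the original script) is to skip missing cells while building the transaction list:

# Alternative: keep only the items actually present in each row
records = [
    [str(item) for item in row if pd.notna(item)]
    for row in store_data.values
]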

# Note: apyori has no min_length parameter, so it is silently ignored
# (the library does support max_length)
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)
print(association_results)

print("There are {} Relation derived.".format(len(association_results))) for i in range(0, len(association_results)): print(association_results[i][0])

for item in association_results:
    # item[0] is the frozenset of items: the base item and the add item
    pair = item[0]
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])

    # item[1] is the support of the itemset
    print("Support: " + str(item[1]))

    # item[2][0] is the first ordered statistic;
    # its third and fourth fields are confidence and lift
    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

X = np.array([[5, 3], [10, 15], [15, 12], [24, 10], [30, 45],
              [85, 70], [71, 80], [60, 78], [55, 52], [80, 91]])
kmeans = KMeans(n_clusters=2)  # k = 2, so the labels are 0 and 1
kmeans.fit(X)

print(kmeans.cluster_centers_)

plt.scatter(X[:, 0], X[:, 1], label='True Position')
plt.show()

plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.show()

plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], color='black')
plt.show()

df_kmeans = pd.get_dummies(df, drop_first=True)  # drop_first removes the redundant dummy column
print(df_kmeans.head())

scaler = StandardScaler()
scaled_df_kmeans = scaler.fit_transform(df_kmeans)

Y = df['housing_median_age']
# Drop the categorical column and the target itself, so the label does not leak into the features
X = df.drop(columns=['ocean_proximity', 'housing_median_age'])

Applying KMeans clustering to the housing data

kmeans_model = KMeans(n_clusters=3)
clusters = kmeans_model.fit_predict(X)
X["Cluster"] = clusters  # attach each row's cluster label to the feature frame

Displaying the clusters

print(X["Cluster"].value_counts())

Additional exploratory analysis for determining the number of clusters

ssd = []
for k in range(2, 9):
    kmeans_model = KMeans(n_clusters=k)
    kmeans_model.fit(X.drop(columns=["Cluster"]))  # exclude the label column added above
    ssd.append(kmeans_model.inertia_)

Plotting SSD for different K values

plt.figure(figsize=(6, 4), dpi=100)
plt.plot(range(2, 9), ssd, color="green", marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("SSD for K")
plt.show()
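Besides the elbow in the SSD curve, the silhouette score gives a second opinion on K (a sketch using sklearn.metrics.silhouette_score, which the original script does not use):

from sklearn.metrics import silhouette_score

# Higher silhouette score means better-separated clusters
features = X.drop(columns=["Cluster"])
for k in range(2, 9):
    labels = KMeans(n_clusters=k).fit_predict(features)
    print(k, silhouette_score(features, labels))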

Splitting the data for Gaussian Naive Bayes classification

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

Gaussian Naive Bayes classifier

naive = GaussianNB()
naive_model = naive.fit(x_train, y_train)

Predictions and evaluation

y_pred = naive_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('The accuracy of the model is: ', accuracy)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
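A single train/test split can be noisy, so cross-validation is a reasonable extra check (a sketch using sklearn.model_selection.cross_val_score; not part of the original script):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the same Gaussian NB model
scores = cross_val_score(GaussianNB(), X, Y, cv=5)
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))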
