import pandas as pd
Project description
import pandas as pd from sklearn import preprocessing import numpy as np from sklearn.discriminant_analysis import StandardScaler from sklearn.preprocessing import scale, MinMaxScaler,OneHotEncoder from sklearn.decomposition import PCA import matplotlib.pyplot as plt from apyori import apriori from sklearn.naive_bayes import GaussianNB from sklearn.cluster import KMeans from sklearn.metrics import classification_report, confusion_matrix, accuracy_score from sklearn.model_selection import train_test_split
# Load the housing data set and take a first look at it.
df = pd.read_csv("housing.csv")
print(df)
print(df.head())
print(df.tail())
print(df.max())
print(df.min())

# Descriptive statistics for the 'median_house_value' column.
target_col = df['median_house_value']
print(target_col.var())
print(target_col.median())
print(target_col.mode())
print(target_col.mean())
print(target_col.std())
print(target_col.count())
print(target_col.describe())
print(target_col.dtypes)

# Drop rows containing missing values before any modelling.
df = df.dropna()
print(df)
# Rescale 'median_house_value' to a unit-L2-norm column vector.
# BUG FIX: preprocessing.normalize defaults to axis=1 (per-row). With a
# column of shape (n, 1) each row holds a single value, so every nonzero
# entry was normalized to 1.0 and all information was lost. axis=0
# normalizes the whole column as one vector, which is what is intended.
x_array = np.array(df['median_house_value']).reshape(-1, 1)
normalized_arr = preprocessing.normalize(x_array, axis=0)
print(normalized_arr)
# One-hot encode the categorical columns and rescale the numeric ones
# into the [0, 2] range.
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
# FIX: the original used list(set(df.columns) - set(numeric_columns)),
# whose ordering is nondeterministic across runs (set iteration order).
# Preserve the frame's own column order instead so the encoded output
# is reproducible.
categorical_columns = [c for c in df.columns if c not in numeric_columns]
df_encoded = pd.get_dummies(df, columns=categorical_columns)
scaler = MinMaxScaler(feature_range=(0, 2))
df_encoded[numeric_columns] = scaler.fit_transform(df_encoded[numeric_columns])
print(df_encoded.head())
# Principal component analysis on the encoded frame; report how much of
# the variance each component explains.
pca = PCA().fit(df_encoded)
print(pca.explained_variance_ratio_)
# Join the student and mark tables on their shared 'Student_id' key.
students = pd.read_csv("student.csv", header=0)
marks = pd.read_csv("mark.csv", header=0)
df_stu = students.merge(marks, on='Student_id')
print(df_stu.head())
print(df_stu.shape)
# Load the market-basket transactions (no header row: each CSV row is
# one basket of items).
store_data = pd.read_csv("store_data.csv", header=None)
print(store_data.head())

# Build the list-of-lists format the apriori miner expects.
# FIX: the original iterated hard-coded ranges (7501 rows x 20 columns),
# which breaks on any other file and, worse, turned every empty cell
# into the literal string 'nan' — a fake item that then appears in the
# mined rules. Iterate the frame's real shape and skip missing cells.
records = []
for row in store_data.itertuples(index=False):
    records.append([str(item) for item in row if pd.notna(item)])
# Mine association rules from the transaction list and materialise the
# generator so the results can be reused.
rules_iter = apriori(records, min_support=0.0045, min_confidence=0.2,
                     min_lift=3, min_length=2)
association_results = list(rules_iter)
print(association_results)

print("There are {} Relation derived.".format(len(association_results)))
# Show the item set of every mined rule.
for rule in association_results:
    print(rule[0])
# Pretty-print each mined rule with its support, confidence and lift.
# FIX: the loop body had lost its indentation (the statements below the
# for-line were dedented and syntactically dead), and the original
# printed items[0] / items[1], which raises IndexError for rules whose
# item set is not exactly two items. Join all participating items.
for item in association_results:
    # item[0] is the frozenset of every item taking part in the rule
    # (antecedent plus consequent).
    pair = item[0]
    items = [x for x in pair]
    print("Rule: " + " -> ".join(items))
    # item[1] is the rule's support.
    print("Support: " + str(item[1]))
    # item[2][0] is the first ordered statistic:
    # (items_base, items_add, confidence, lift).
    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")
# Toy two-dimensional K-Means demo on ten hand-picked points.
X = np.array([[5, 3], [10, 15], [15, 12], [24, 10], [30, 45],
              [85, 70], [71, 80], [60, 78], [55, 52], [80, 91]])
kmeans = KMeans(n_clusters=2)  # two clusters, labelled 0 and 1
kmeans.fit(X)
print(kmeans.cluster_centers_)

# Raw points.
plt.scatter(X[:, 0], X[:, 1], label='True Position')
plt.show()

# Points coloured by their assigned cluster.
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.show()

# Clustered points with the two centroids drawn in black.
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            color='black')
plt.show()
# One-hot encode the housing frame, dropping the first dummy of each
# category to avoid a redundant column, then standardise the result.
df_kmeans = pd.get_dummies(df, drop_first=True)
print(df_kmeans.head())
scaled_df_kmeans = StandardScaler().fit_transform(df_kmeans)
# Target and feature matrix for the experiments below.
# NOTE(review): X keeps the target column 'housing_median_age' (it is
# needed just below to position the cluster column); make sure it is
# excluded from the features before any supervised training.
Y = df['housing_median_age']
X = df.drop(columns=['ocean_proximity'])

# Applying KMeans clustering to the housing data.
# (FIX: the section headings were bare prose lines — syntax errors —
# and are now comments.)
kmeans_model = KMeans(n_clusters=3)
clusters = kmeans_model.fit_predict(X)
# Insert the cluster label immediately before 'housing_median_age'.
X.insert(X.columns.get_loc("housing_median_age"), "Cluster", clusters)

# Displaying the clusters.
print(X["Cluster"].value_counts())
Additional exploratory analysis for determining the number of clusters
ssd = [] for k in range(2, 9): kmeans_model = KMeans(n_clusters=k) kmeans_model.fit(X) ssd.append(kmeans_model.inertia_)
Plotting SSD for different K values
plt.figure(figsize=(6, 4), dpi=100) plt.plot(range(2, 9), ssd, color="green", marker="o") plt.xlabel("Number of clusters (K)") plt.ylabel("SSD for K") plt.show()
Splitting the data for Gaussian Naive Bayes classification
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
Gaussian Naive Bayes classifier
naive = GaussianNB() naive_model = naive.fit(x_train, y_train)
Predictions and evaluation
y_pred = naive_model.predict(x_test) accuracy = accuracy_score(y_test, y_pred) print('The accuracy of the model is: ', accuracy)
print(confusion_matrix(y_test, y_pred)) print(classification_report(y_test, y_pred))
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Hashes for shimpiproductions-0.2-py3-none-any.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 59ca6fca1abc79d65f8de3eadab7399d183ca9b53f629153440e378ec48accd0 |
| MD5 | 6c591aaa32e837b494b84ac563b001c5 |
| BLAKE2b-256 | 68a875b9cf461ea80896c660b91db7afa8c99caa1c643501e6fda4e100a9053a |