import pandas as pd
# Project description
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from apyori import apriori
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, scale
# --- Load the housing data and take a first exploratory look ---
df = pd.read_csv("housing.csv")
print(df)
print(df.head())
print(df.tail())
print(df.max())
print(df.min())

# Univariate statistics for the median_house_value column.
mhv = df['median_house_value']
print(mhv.var())
print(mhv.median())
print(mhv.mode())
print(mhv.mean())
print(mhv.std())
print(mhv.count())
print(mhv.describe())
print(mhv.dtypes)

# Drop every row containing a missing value before the later steps.
df = df.dropna()
print(df)
# Normalize the target column as a single vector.
# BUG FIX: preprocessing.normalize defaults to axis=1 (per-row L2 norm);
# on an (n, 1) column that maps every nonzero value to +/-1.0, which
# carries no information. axis=0 normalizes the column as one vector.
x_array = np.array(df['median_house_value']).reshape(-1, 1)
normalized_arr = preprocessing.normalize(x_array, axis=0)
print(normalized_arr)
# One-hot encode the categorical columns, then rescale the numeric
# columns into the range [0, 2].
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
# FIX: the original used list(set(df.columns) - set(numeric_columns)),
# whose order is nondeterministic across runs (string hash randomization),
# so the encoded column order changed from run to run. An order-preserving
# comprehension keeps the same elements in a stable order.
categorical_columns = [c for c in df.columns if c not in numeric_columns]
df_encoded = pd.get_dummies(df, columns=categorical_columns)
scaler = MinMaxScaler(feature_range=(0, 2))
df_encoded[numeric_columns] = scaler.fit_transform(df_encoded[numeric_columns])
print(df_encoded.head())
# Principal component analysis on the encoded/scaled frame; report the
# fraction of variance carried by each component.
pca = PCA().fit(df_encoded)
print(pca.explained_variance_ratio_)
# Join the student roster with their marks on the shared key column.
df1 = pd.read_csv("student.csv", header=0)
df2 = pd.read_csv("mark.csv", header=0)
df_stu = df1.merge(df2, on='Student_id')
print(df_stu.head())
print(df_stu.shape)
# Load the market-basket data: each row is one transaction, one item per cell.
store_data = pd.read_csv("store_data.csv", header=None)
print(store_data.head())

# Build the transaction list for apriori.
# FIX: the original hard-coded the bounds (7501 rows x 20 columns) and
# stringified every cell, so the NaN padding of short transactions became
# the literal item 'nan' -- a very frequent bogus item that pollutes the
# mined rules. Derive the bounds from the frame and skip missing cells.
records = [
    [str(v) for v in row if pd.notna(v)]
    for row in store_data.values
]
# Mine association rules from the transaction list.
rules_iter = apriori(records, min_support=0.0045, min_confidence=0.2,
                     min_lift=3, min_length=2)
association_results = list(rules_iter)
print(association_results)

print("There are {} Relation derived.".format(len(association_results)))
# Show just the itemset of each mined relation.
for result in association_results:
    print(result[0])
# Pretty-print each mined rule with its support, confidence and lift.
for item in association_results:
    # FIX: item[0] is a frozenset, so printing items[0] -> items[1] gave an
    # arbitrary rule direction (and crashed on single-item records). The
    # first ordered statistic carries the actual antecedent (items_base)
    # and consequent (items_add) of the rule.
    stat = item[2][0]
    lhs = ", ".join(stat[0])
    rhs = ", ".join(stat[1])
    print("Rule: " + lhs + " -> " + rhs)
    # Support of the full itemset.
    print("Support: " + str(item[1]))
    # Confidence and lift of that first ordered statistic.
    print("Confidence: " + str(stat[2]))
    print("Lift: " + str(stat[3]))
    print("=====================================")
# Toy 2-D KMeans demo on ten hand-picked points forming two groups.
X = np.array([[5, 3], [10, 15], [15, 12], [24, 10], [30, 45],
              [85, 70], [71, 80], [60, 78], [55, 52], [80, 91]])
kmeans = KMeans(n_clusters=2)  # k=2 -> labels 0 and 1
kmeans.fit(X)
print(kmeans.cluster_centers_)

# Raw scatter of the points.
plt.scatter(X[:, 0], X[:, 1], label='True Position')
plt.show()

# Same points coloured by their assigned cluster.
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.show()

# Cluster colouring again, with the learned centroids overlaid in black.
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            color='black')
plt.show()
# One-hot encode the housing frame; drop_first removes the redundant
# reference column for each categorical variable.
df_kmeans = pd.get_dummies(df, drop_first=True)
print(df_kmeans.head())

# Standardize the encoded frame.
# NOTE(review): scaled_df_kmeans is never used below -- the clustering and
# the classifier both run on unscaled data. Confirm whether that is intended.
scaled_df_kmeans = StandardScaler().fit_transform(df_kmeans)

# Supervised setup: predict housing_median_age from the other columns.
# NOTE(review): X still contains the target column housing_median_age, so
# the classifier below sees its own label as a feature (target leakage) --
# confirm before trusting the reported accuracy.
Y = df['housing_median_age']
X = df.drop(columns=['ocean_proximity'])
# Applying KMeans clustering to the housing data
# Fit k=3 clusters on the housing features and record each row's cluster
# id in a new column inserted just before housing_median_age.
kmeans_model = KMeans(n_clusters=3)
labels = kmeans_model.fit_predict(X)
X.insert(X.columns.get_loc("housing_median_age"), "Cluster", labels)
# Displaying the clusters
# Cluster sizes: how many rows landed in each of the three clusters.
print(X["Cluster"].value_counts())
# Additional exploratory analysis for determining the number of clusters
# Elbow analysis: collect the sum of squared distances (inertia) of a
# fresh KMeans fit for every K from 2 to 8.
# NOTE(review): X here includes the "Cluster" column inserted above --
# confirm that is intended for the elbow computation.
ssd = [KMeans(n_clusters=k).fit(X).inertia_ for k in range(2, 9)]
# Plotting SSD for different K values
# Plot the elbow curve: pick the K where the SSD drop flattens out.
ks = range(2, 9)
plt.figure(figsize=(6, 4), dpi=100)
plt.plot(ks, ssd, color="green", marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("SSD for K")
plt.show()
# Splitting the data for Gaussian Naive Bayes classification
# 75/25 train/test split, seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
# Gaussian Naive Bayes classifier
# Fit a Gaussian Naive Bayes classifier on the training split.
naive = GaussianNB()
naive_model = naive.fit(x_train, y_train)
# Predictions and evaluation
# Score the held-out split.
y_pred = naive_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('The accuracy of the model is: ', accuracy)

# Per-class breakdown of the predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file shimpiproductions-0.2.tar.gz.
File metadata
- Download URL: shimpiproductions-0.2.tar.gz
- Upload date:
- Size: 5.0 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.12.0
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | `1b7c2ea6a54d78252894bf6cef9c82f89cc0b4d75caea164252b780a3b388728` |
| MD5 | `7c052a77202fbb8a5b9987ee97ce13ad` |
| BLAKE2b-256 | `9e33718744f54ab2e8a740124e34848e2f0a1595ec0e63a108200c86e0b39b54` |
File details
Details for the file shimpiproductions-0.2-py3-none-any.whl.
File metadata
- Download URL: shimpiproductions-0.2-py3-none-any.whl
- Upload date:
- Size: 4.2 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.12.0
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | `59ca6fca1abc79d65f8de3eadab7399d183ca9b53f629153440e378ec48accd0` |
| MD5 | `6c591aaa32e837b494b84ac563b001c5` |
| BLAKE2b-256 | `68a875b9cf461ea80896c660b91db7afa8c99caa1c643501e6fda4e100a9053a` |