Import dependencies
import pandas as pd
%matplotlib inline
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
# Load the red-wine quality CSV (expected in the current working directory)
# into a data frame and preview the first rows.
redwine_df = pd.read_csv('wineQualityReds.csv')
redwine_df.head()

# Rename/clean up column names on an independent copy, so `redwine_df` keeps
# the raw column names. The copy avoids parsing the same file a second time.
#   - "quality" becomes "Class": the wine-quality label. Original author's
#     note: quality is rated on a 1-10 scale and the scores present in the
#     data form 6 classes of wine quality.
#   - "Unnamed: 0" becomes "count" (presumably the row-number column of the
#     CSV, which has no header name -- verify against the file).
wine = redwine_df.copy()
new_wine = wine.rename(columns={"quality": "Class", "Unnamed: 0": "count"})
new_wine.head()
Visual showing the correlation between the chemical components in red wine.
# Heatmap of the pairwise correlations between all columns of new_wine.
# plt.subplots() returns a (figure, axes) pair; unpack it so the heatmap is
# drawn on this specific axes and the image is saved from this figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(new_wine.corr(), ax=ax)
fig.savefig("correlation.png")
Quantifying values of correlation matrix.
# Keep the correlation matrix of new_wine as its own data frame so the
# numeric coefficients can be inspected, then preview its first five rows.
# "pearson" and 5 are the pandas defaults, spelled out here for clarity.
correlation = new_wine.corr(method="pearson")
correlation.head(5)
Machine Learning model for clustering the wine based on the 6 classes.
from sklearn.cluster import KMeans

# Partition the samples into 6 clusters (one per wine-quality class).
# NOTE(review): redwine_df is the raw frame, so the "quality" label and
# presumably the "Unnamed: 0" column are part of the feature matrix, and no
# feature is scaled -- confirm that this is intended.
kmeans = KMeans(n_clusters=6)
kmeans.fit(redwine_df)
# Estimator repr echoed by the notebook after fit(), kept for reference only:
#   KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
#          n_clusters=6, n_init=10, n_jobs=1, precompute_distances='auto',
#          random_state=None, tol=0.0001, verbose=0)
# Executing it would merely build an estimator that is thrown away, and it
# raises TypeError on scikit-learn releases that no longer accept `n_jobs` /
# `precompute_distances`, so it must stay a comment.

# Cluster index assigned to every row of the frame that was just fitted.
predicted_clusters = kmeans.predict(redwine_df)
# Re-read the dataset for the scatter/cluster plots and expose the quality
# score under the column name "Class".
wine_data = pd.read_csv('wineQualityReds.csv')
new_wine = wine_data.rename(columns={"quality": "Class"})
wine_df = pd.DataFrame(new_wine)
# Shift every class label down by one (assigned as a new column value rather
# than modified in place).
wine_df["Class"] = wine_df["Class"] - 1
A rule of thumb is that the lower the percentage of alcohol, the more sugar is still in the fluid, which makes for a heavier liquid.
from: https://bartenderly.com/tips-tricks/alcohol-density-chart/
from sklearn.cluster import KMeans

# Raw data: alcohol vs. density, coloured by quality class.
wine_df.plot.scatter(x='alcohol', y='density', c='Class', figsize=(12, 8), colormap='jet')
plt.savefig("densityalcoholscatter.png")

# Cluster on the two plotted features only. The columns are selected by name
# (not by position) so the selection cannot silently drift if the column order
# of the CSV changes; the plots below rely on these exact names anyway.
alcohol_density = wine_df[['alcohol', 'density']]
# random_state=None: the centroids may differ from run to run.
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=1000,
                random_state=None).fit(alcohol_density)
# One row per cluster centre, labelled with the feature names it was fitted on.
centroids_df = pd.DataFrame(kmeans.cluster_centers_,
                            columns=list(alcohol_density.columns))

# Overlay the centroids (black squares) on the class-coloured scatter.
fig, ax = plt.subplots(1, 1)
wine_df.plot.scatter(x='alcohol', y='density', c='Class', figsize=(12, 8),
                     colormap='jet', ax=ax, mark_right=False)
centroids_df.plot.scatter(x='alcohol', y='density', c='black', ax=ax, s=100, marker='s')
plt.savefig("densityalcoholcluster.png")
centroids_df.head()
Linear regression on centroids.
from scipy.stats import linregress

# Least-squares line through the cluster centroids: density as a function of
# alcohol. `r` is the correlation coefficient and `error` the fifth value
# returned by linregress (the standard error of the slope).
centroid_x = centroids_df["alcohol"]
centroid_y = centroids_df["density"]
slope, intercept, r, p, error = linregress(centroid_x, centroid_y)
fit = intercept + slope * centroid_x

fig, ax = plt.subplots(figsize=(12, 8))
# Centroids as black squares with no connecting line.
ax.plot(centroid_x, centroid_y, marker='s', color='black', linewidth="0")
# Fitted regression line, drawn red and dashed.
ax.plot(centroid_x, fit, 'r--')
ax.set_title("Density vs. Alcohol")
ax.set_xlabel("Alcohol")
ax.set_ylabel("Density")
ax.grid(True)
fig.savefig("densityalcoholregress.png")
print("r:", r, "error:", error)
The more acidity a wine has, the more tart it will taste.
# Raw data: fixed acidity vs. density, coloured by quality class.
wine_df.plot.scatter(x='fixed.acidity', y='density', c='Class', figsize=(12, 8), colormap='jet')
plt.savefig("densityacidityscatter.png")

# Cluster on the two plotted features only, selected by column name rather
# than by position so a change in CSV column order cannot pick wrong features.
acidity_density = wine_df[['fixed.acidity', 'density']]
# random_state=None: the centroids may differ from run to run.
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=1000,
                random_state=None).fit(acidity_density)
# One row per cluster centre, labelled with the feature names it was fitted on.
centroids_df2 = pd.DataFrame(kmeans.cluster_centers_,
                             columns=list(acidity_density.columns))

# Overlay the centroids (black squares) on the class-coloured scatter.
# No regression line is computed here: the slope/intercept in scope at this
# point belong to a different pair of features, and a zero-width, marker-less
# line through the centroids would draw nothing.
fig, ax = plt.subplots(1, 1)
wine_df.plot.scatter(x='fixed.acidity', y='density', c='Class', figsize=(12, 8),
                     colormap='jet', ax=ax, mark_right=False)
centroids_df2.plot.scatter(x='fixed.acidity', y='density', c='black', ax=ax, s=100, marker='s')
plt.title("Density vs. Fixed Acidity")
plt.xlabel("Fixed Acidity")
plt.ylabel("Density")
plt.grid(True)
plt.savefig("densityacidityfinal.png")
centroids_df2.head()
# Least-squares line through the fixed-acidity/density centroids. `r` is the
# correlation coefficient and `error` the fifth value returned by linregress
# (the standard error of the slope).
acidity_x = centroids_df2["fixed.acidity"]
density_y = centroids_df2["density"]
slope, intercept, r, p, error = linregress(acidity_x, density_y)
fit = intercept + slope * acidity_x

fig, ax = plt.subplots(figsize=(12, 8))
# Centroids as black squares with no connecting line.
ax.plot(acidity_x, density_y, marker='s', color='black', linewidth="0")
# Fitted regression line, drawn red and dashed.
ax.plot(acidity_x, fit, 'r--')
ax.set_title("Density vs. Fixed Acidity")
ax.set_xlabel("Fixed Acidity")
ax.set_ylabel("Density")
ax.grid(True)
fig.savefig("densityacidityregress.png")
print("r:", r, "error:", error)