2 Commits

Author SHA1 Message Date
ION606 88f2975b86 added lab 4 2025-10-31 17:55:13 -04:00
ION606 dc2ceac7de transfer 2025-10-17 09:26:49 -04:00
60 changed files with 730 additions and 5 deletions

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 24 KiB

Before

Width:  |  Height:  |  Size: 25 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Before

Width:  |  Height:  |  Size: 42 KiB

After

Width:  |  Height:  |  Size: 42 KiB

Before

Width:  |  Height:  |  Size: 63 KiB

After

Width:  |  Height:  |  Size: 63 KiB

Before

Width:  |  Height:  |  Size: 62 KiB

After

Width:  |  Height:  |  Size: 62 KiB

Before

Width:  |  Height:  |  Size: 63 KiB

After

Width:  |  Height:  |  Size: 63 KiB

Before

Width:  |  Height:  |  Size: 64 KiB

After

Width:  |  Height:  |  Size: 64 KiB

-5
View File
@@ -1,5 +0,0 @@
node_modules
.venv
.vscode
Assignment III
tmp/
BIN
View File
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,41 @@
##########################################
### Principal Component Analysis (PCA) ###
##########################################
## load libraries
library(ggplot2)
library(ggfortify)
library(GGally)
library(e1071)
library(class)
library(psych)
library(readr)
## set working directory so that files can be referenced without the full path
## NOTE(review): absolute, machine-specific path -- the script stops here on any
## other machine; point it at the folder containing wine.data before running
setwd("/home/ion606/Desktop/Data Analytics/Lab 4")
## read dataset (UCI wine data is a headerless CSV, hence col_names = FALSE)
wine <- read_csv("wine.data", col_names = FALSE)
## set column names (col 1 is the class label, cols 2-14 are the 13 predictors)
names(wine) <- c("Type","Alcohol","Malic acid","Ash","Alcalinity of ash","Magnesium","Total phenols","Flavanoids","Nonflavanoid Phenols","Proanthocyanins","Color Intensity","Hue","Od280/od315 of diluted wines","Proline")
## inspect data frame
head(wine)
## change the data type of the "Type" column from character to factor
####
# Factors look like regular strings (characters) but with factors R knows
# that the column is a categorical variable with finite possible values
# e.g. "Type" in the Wine dataset can only be 1, 2, or 3
####
wine$Type <- as.factor(wine$Type)
## visualize variables
## scatterplot matrix of the 13 predictors; point fill (bg) encodes wine class
pairs.panels(wine[,-1],gap = 0,bg = c("red", "yellow", "blue")[wine$Type],pch=21)
## ggpairs matrix with panels coloured by class
ggpairs(wine, ggplot2::aes(colour = Type))
###
Binary file not shown.
+366
View File
@@ -0,0 +1,366 @@
# Return TRUE when `pkg` is installed and loadable, without attaching it or
# printing startup noise; used below to gate optional dependencies.
has_pkg <- function(pkg) {
  requireNamespace(pkg, quietly = TRUE)
}
# record which optional packages are installed so the script can degrade
# gracefully instead of dying on a missing library
has_ggplot2 <- has_pkg("ggplot2")
has_GGally <- has_pkg("GGally")
has_e1071 <- has_pkg("e1071")
has_class <- has_pkg("class")
has_psych <- has_pkg("psych")
has_readr <- has_pkg("readr")
# WHY IS THIS HERE YOU MIGHT ASK???? WELL LET ME TELL YOU I SPENT TWO HOURS ON STUPID PACKAGE IMPORTS
# OOOOOOHHH PSYCH IS IN A DIFFERENT REPO??? OH IT ISN'T??? I have a fever of 103 I DO NOT CARE
# attach whatever exists; only `class` (provides knn) is a hard requirement
if (has_ggplot2) { library(ggplot2) } else { warning("ggplot2 not available; plots will be skipped") }
if (has_GGally) { library(GGally) } else { message("GGally not available; skipping ggpairs plot") }
if (has_e1071) { library(e1071) }
if (has_class) { library(class) } else { stop("class package not available for kNN") }
if (!has_psych) { message("psych not available; skipping pairs.panels plot") }
if (has_readr) { library(readr) }
library(grid) # unit() for arrows in plots
# request the pre-R-3.6.0 sample() algorithm so set.seed() reproduces results
# from older R; suppressWarnings hides the "non-default" warning newer R emits
suppressWarnings(RNGkind(sample.kind = "Rounding"))
# set a reproducible seed
set.seed(4600)
# Locate and load the UCI wine dataset:
# 178 rows; col 1 is the class label (1,2,3); the other 13 columns are
# continuous predictors.
candidate_files <- c(
  "wine.data",
  "./wine.data",
  "../wine.data",
  "DAN/wine.data",
  "./DAN/wine.data"
)
# file.exists() is vectorized: keep the candidates that are present and
# take the first one (same precedence order as before).
existing_files <- candidate_files[file.exists(candidate_files)]
data_path <- if (length(existing_files) > 0) existing_files[[1]] else NA
if (is.na(data_path)) {
  stop("could not find wine.data; place this script in the DAN folder or given/ and re-run")
}
# Prefer readr when available (quiet, no type-guessing chatter);
# otherwise fall back to base read.csv. The file has no header row.
if (has_readr) {
  wine <- readr::read_csv(
    file = data_path,
    col_names = FALSE,
    show_col_types = FALSE,
    progress = FALSE
  )
} else {
  wine <- read.csv(file = data_path, header = FALSE)
}
# Assign descriptive column names; the first column is the class label.
wine_col_names <- c(
  "Type", "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash",
  "Magnesium", "Total_phenols", "Flavanoids", "Nonflavanoid_phenols",
  "Proanthocyanins", "Color_intensity", "Hue", "OD280_OD315", "Proline"
)
colnames(wine) <- wine_col_names
# The class label is categorical with values 1/2/3, so store it as a factor.
wine$Type <- as.factor(wine$Type)
# Fail fast if the wrong file was read: the UCI wine data is exactly 178 x 14.
# (kept from an earlier debugging session where the wrong file kept being
# loaded; it makes the script more, "robust")
stopifnot(nrow(wine) == 178, ncol(wine) == 14)
print(summary(wine$Type))
# exploratory plots (because I went down a rabbit hole and by god I'm using it)
if (has_psych) {
# pairs panel (psych) colors by class
# scatterplot matrix of all 13 predictors; bg fills points by wine class
psych::pairs.panels(
wine[,-1],
gap = 0,
bg = c("red","gold","royalblue")[wine$Type],
pch = 21,
main = "wine (uci) scatterplot matrix by class"
)
}
if (has_GGally && has_ggplot2) {
# ggpairs for nice matrix <3
# columns = 2:ncol(wine) keeps the Type column itself out of the panel grid
GGally::ggpairs(wine, ggplot2::aes(colour = Type), columns = 2:ncol(wine))
}
# split into train/test BEFORE any preprocessing (scaling, PCA) so test
# rows can never leak into the fitted statistics
set.seed(4600)
n <- nrow(wine)
train_idx <- sample.int(n, size = floor(0.7 * n))
wine_train <- wine[train_idx, , drop = FALSE]
wine_test <- wine[-train_idx, , drop = FALSE]
X_train <- wine_train[, -1]   # 13 predictors
y_train <- wine_train$Type    # class labels
X_test <- wine_test[, -1]
y_test <- wine_test$Type
# guard: scale()/prcomp(scale. = TRUE) divide by the column sd, so a
# zero-variance training column would produce NaN/Inf downstream.
# vapply pins the return type (logical), unlike sapply.
zero_var <- vapply(X_train, function(x) var(x, na.rm = TRUE) == 0, logical(1))
if (any(zero_var)) {
warning("one or more predictors have zero variance in the training set; scale() would fail")
}
# `||` (scalar, short-circuiting) is the correct operator inside `if`;
# the elementwise `|` used previously errors on length > 1 in R >= 4.3
if (anyNA(X_train) || anyNA(X_test)) {
stop("found NA values in predictors; handle missingness before PCA")
}
# fit the PCA on TRAINING data only; both train and test are projected with it
pca_tr <- prcomp(X_train, center = TRUE, scale. = TRUE)
# proportion of variance explained by each component
pve_tr <- (pca_tr$sdev^2) / sum(pca_tr$sdev^2)
pve_df <- data.frame(
PC = paste0("PC", seq_along(pve_tr)),
PVE = pve_tr,
CumPVE = cumsum(pve_tr)
)
print("variance explained (training pca):")
print(pve_df)
# scree plot from training pca
# (seq_along replaces the 1:length(...) anti-pattern, which misbehaves on
# zero-length input)
p_scree <- ggplot(pve_df, aes(x = seq_along(PVE), y = PVE)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
labs(title = "scree plot variance explained by principal components (training pca)",
x = "principal component", y = "proportion of variance explained") +
theme_minimal()
# cumulative variance plot from training pca
p_cumvar <- ggplot(pve_df, aes(x = seq_along(CumPVE), y = CumPVE)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
labs(title = "cumulative variance explained (training pca)",
x = "principal component", y = "cumulative proportion of variance") +
theme_minimal()
# ========================================================================================================
# choose number of pcs: default to the smallest k with >= thresh cum variance
# you can change thresh to 0.90 or 0.99 if you prefer
pc_variance_threshold <- 0.95
k_pcs <- which(cumsum(pve_tr) >= pc_variance_threshold)[1]
# which(...)[1] is NA when no component reaches the threshold; keep them all then
if (is.na(k_pcs)) k_pcs <- ncol(X_train)
cat("chosen number of pcs (threshold =", pc_variance_threshold, "):", k_pcs, "\n")
# project train/test into the pca space
# predict() applies the training centering/scaling, so test rows never
# influence the fitted components (no leakage)
Z_train_full <- as.data.frame(predict(pca_tr, newdata = X_train))
Z_test_full <- as.data.frame(predict(pca_tr, newdata = X_test))
# for downstream modeling: keep only the first k_pcs components chosen above
Z_train <- Z_train_full[, seq_len(k_pcs), drop = FALSE]
Z_test <- Z_test_full[, seq_len(k_pcs), drop = FALSE]
# projection of ALL rows (train + test) -- used only for the visualizations below
scores_all <- as.data.frame(predict(pca_tr, newdata = wine[,-1]))
scores_all$Type <- wine$Type
# loadings from training pca
loadings <- as.data.frame(pca_tr$rotation)
loadings$Variable <- rownames(loadings)
# the five variables with the largest absolute loading on PC1 / PC2
top_pc1 <- loadings[order(abs(loadings$PC1), decreasing = TRUE), c("Variable","PC1")][1:5, ]
top_pc2 <- loadings[order(abs(loadings$PC2), decreasing = TRUE), c("Variable","PC2")][1:5, ]
print("top contributors to pc1 (training pca):"); print(top_pc1)
print("top contributors to pc2 (training pca):"); print(top_pc2)
# function to make convex hull data for each group
# chull() returns the indices of each class's convex hull; the hull points
# are stacked into one data frame used to shade a translucent region per class
scores <- scores_all
hull_df <- do.call(rbind, lapply(split(scores, scores$Type), function(df) {
pts <- df[chull(df$PC1, df$PC2), c("PC1","PC2")]
pts$Type <- unique(df$Type)
pts
}))
# PC1 vs PC2 scatter, points coloured by class, hulls shaded per class
p_pc12 <- ggplot(scores, aes(PC1, PC2, color = Type)) +
geom_point(size = 2, alpha = 0.85) +
geom_polygon(data = hull_df, aes(fill = Type, group = Type), color = NA, alpha = 0.15) +
guides(fill = "none") +
theme_minimal() +
labs(title = "pc1 vs pc2 by class (projected with training pca)")
# arrow arrow arrow arrow arrow arrow arrow arrow arrow
# scale the loading vectors up so the arrows are visible against the scores
loading_scalefactor <- 3 * max(abs(scores$PC1), abs(scores$PC2)) # heuristic
load_plot_df <- loadings
load_plot_df$PC1s <- load_plot_df$PC1 * loading_scalefactor
load_plot_df$PC2s <- load_plot_df$PC2 * loading_scalefactor
# biplot: scores coloured by class plus black loading arrows with labels
p_biplot <- ggplot(scores, aes(PC1, PC2, color = Type)) +
geom_point(size = 2, alpha = 0.85) +
geom_segment(
data = load_plot_df,
mapping = aes(x = 0, y = 0, xend = PC1s, yend = PC2s),
inherit.aes = FALSE,
arrow = arrow(length = unit(0.02, "npc")),
color = "black",
alpha = 0.8
) +
geom_text(
data = load_plot_df,
mapping = aes(x = PC1s, y = PC2s, label = Variable),
inherit.aes = FALSE,
hjust = 0,
vjust = 0
) +
theme_minimal() +
labs(title = "pc1 vs pc2 with variable loadings (training pca projection)")
# Two models are compared downstream:
# 1) kNN on original variables with standardization
# 2) kNN on first 2 principal components only
#
# compute_metrics: per-class and macro-averaged classification metrics from a
# confusion matrix.
#   cm      square matrix/table of counts, rows = true class, cols = predicted
#   returns list(per_class = data.frame(class, precision, recall, f1),
#                summary   = data.frame(accuracy, macro_precision,
#                                       macro_recall, macro_f1))
compute_metrics <- function(cm) {
  lv <- rownames(cm)
  # fall back to positional labels; seq_len handles the degenerate 0-row case
  if (is.null(lv)) lv <- as.character(seq_len(nrow(cm)))
  TP <- diag(cm)
  FP <- colSums(cm) - TP   # predicted as this class but actually another
  FN <- rowSums(cm) - TP   # actually this class but predicted as another
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  # harmonic mean; NaN when precision + recall == 0 (class never predicted
  # and never recalled), which the na.rm below excludes from the macro means
  f1 <- 2 * precision * recall / (precision + recall)
  acc <- sum(TP) / sum(cm)
  macro_precision <- mean(precision, na.rm = TRUE)
  macro_recall <- mean(recall, na.rm = TRUE)
  macro_f1 <- mean(f1, na.rm = TRUE)
  per_class <- data.frame(
    class = lv,
    precision = precision,
    recall = recall,
    f1 = f1,
    row.names = NULL
  )
  summary <- data.frame(
    accuracy = acc,
    macro_precision = macro_precision,
    macro_recall = macro_recall,
    macro_f1 = macro_f1
  )
  list(per_class = per_class, summary = summary)
}
set.seed(4600)
# candidate neighborhood sizes for kNN (odd values 1..15)
ks <- seq(1, 15, by = 2)
# number of cross-validation folds
Kfolds <- 5
# kNN on original vars
# standardize with TRAINING statistics only, then apply the same center/scale
# values to the test set (kNN is distance-based, so scaling matters; reusing
# the training stats avoids leakage)
X_train_scaled <- scale(X_train, center = TRUE, scale = TRUE)
scale_center <- attr(X_train_scaled, "scaled:center")
scale_scale <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = scale_center, scale = scale_scale)
n_train_orig <- nrow(X_train_scaled)
# random fold assignment, kept roughly balanced via rep(length.out = n)
folds_orig <- sample(rep(1:Kfolds, length.out = n_train_orig))
# mean 5-fold CV accuracy for each candidate k
cv_acc_orig <- sapply(ks, function(k) {
mean(sapply(1:Kfolds, function(f) {
tr <- which(folds_orig != f)
va <- which(folds_orig == f)
pred_cv <- knn(train = X_train_scaled[tr, , drop = FALSE],
test = X_train_scaled[va, , drop = FALSE],
cl = y_train[tr], k = k)
mean(pred_cv == y_train[va])
}))
})
# pick the k with the best mean CV accuracy, then refit on all training rows
# and evaluate once on the held-out test set
best_k_orig <- ks[which.max(cv_acc_orig)]
cat("[Original vars] best k:", best_k_orig, "cv acc:", max(cv_acc_orig), "\n")
pred_orig <- knn(train = X_train_scaled, test = X_test_scaled, cl = y_train, k = best_k_orig)
acc_orig <- mean(pred_orig == y_test)
cm_orig <- table(truth = y_test, pred = pred_orig)
cat("[Original vars] held-out accuracy:", round(acc_orig, 4), "\n")
print(cm_orig)
metrics_orig <- compute_metrics(cm_orig)
print(metrics_orig$summary)
print(metrics_orig$per_class)
# kNN on first 2 PCs only
# same CV protocol as above, but in the 2-dimensional PC score space
# (PC scores come from the training-fitted PCA, so no extra scaling is done)
Z2_train <- Z_train_full[, 1:2, drop = FALSE]
Z2_test <- Z_test_full[, 1:2, drop = FALSE]
n_train_2pc <- nrow(Z2_train)
folds_2pc <- sample(rep(1:Kfolds, length.out = n_train_2pc))
# mean 5-fold CV accuracy for each candidate k
cv_acc_2pc <- sapply(ks, function(k) {
mean(sapply(1:Kfolds, function(f) {
tr <- which(folds_2pc != f)
va <- which(folds_2pc == f)
pred_cv <- knn(train = Z2_train[tr, , drop = FALSE],
test = Z2_train[va, , drop = FALSE],
cl = y_train[tr], k = k)
mean(pred_cv == y_train[va])
}))
})
# best k by CV, then a single evaluation on the held-out test set
best_k_2pc <- ks[which.max(cv_acc_2pc)]
cat("[First 2 PCs] best k:", best_k_2pc, "cv acc:", max(cv_acc_2pc), "\n")
pred_2pc <- knn(train = Z2_train, test = Z2_test, cl = y_train, k = best_k_2pc)
acc_2pc <- mean(pred_2pc == y_test)
cm_2pc <- table(truth = y_test, pred = pred_2pc)
cat("[First 2 PCs] held-out accuracy:", round(acc_2pc, 4), "\n")
print(cm_2pc)
metrics_2pc <- compute_metrics(cm_2pc)
print(metrics_2pc$summary)
print(metrics_2pc$per_class)
# ===========================================================================================
# persist all artifacts (plots, tables, metrics) under ./outputs
outputs_dir <- "outputs"
if (!dir.exists(outputs_dir)) dir.create(outputs_dir, recursive = TRUE, showWarnings = FALSE)
# plots
# each ggsave is guarded: the plot object only exists if ggplot2 was available
if (exists("p_pc12") && inherits(p_pc12, "ggplot")) ggsave(filename = file.path(outputs_dir, "pc12_scatter.png"), plot = p_pc12, width = 8, height = 6, dpi = 300)
if (exists("p_biplot") && inherits(p_biplot, "ggplot")) ggsave(filename = file.path(outputs_dir, "pc12_biplot.png"), plot = p_biplot, width = 8, height = 6, dpi = 300)
if (exists("p_scree") && inherits(p_scree, "ggplot")) ggsave(filename = file.path(outputs_dir, "pca_scree.png"), plot = p_scree, width = 8, height = 6, dpi = 300)
if (exists("p_cumvar") && inherits(p_cumvar, "ggplot")) ggsave(filename = file.path(outputs_dir, "pca_cumvar.png"), plot = p_cumvar, width = 8, height = 6, dpi = 300)
# top contributors/vars to PC1 and PC2
write.csv(top_pc1, file = file.path(outputs_dir, "top_contributors_pc1.csv"), row.names = FALSE)
write.csv(top_pc2, file = file.path(outputs_dir, "top_contributors_pc2.csv"), row.names = FALSE)
# confusion matrices as wide CSV and pretty text
# (capture.output preserves the printed table layout for the .txt version)
write.csv(as.matrix(cm_orig), file = file.path(outputs_dir, "confusion_original_wide.csv"))
writeLines(capture.output(cm_orig), con = file.path(outputs_dir, "confusion_original.txt"))
write.csv(as.matrix(cm_2pc), file = file.path(outputs_dir, "confusion_2pc_wide.csv"))
writeLines(capture.output(cm_2pc), con = file.path(outputs_dir, "confusion_2pc.txt"))
# metrics
write.csv(metrics_orig$per_class, file = file.path(outputs_dir, "metrics_original_per_class.csv"), row.names = FALSE)
write.csv(metrics_orig$summary, file = file.path(outputs_dir, "metrics_original_summary.csv"), row.names = FALSE)
write.csv(metrics_2pc$per_class, file = file.path(outputs_dir, "metrics_2pc_per_class.csv"), row.names = FALSE)
write.csv(metrics_2pc$summary, file = file.path(outputs_dir, "metrics_2pc_summary.csv"), row.names = FALSE)
# summary: side-by-side comparison of the two models
metrics_compare <- data.frame(
model = c("original_variables", "first_2_pcs"),
accuracy = c(metrics_orig$summary$accuracy, metrics_2pc$summary$accuracy),
macro_precision = c(metrics_orig$summary$macro_precision, metrics_2pc$summary$macro_precision),
macro_recall = c(metrics_orig$summary$macro_recall, metrics_2pc$summary$macro_recall),
macro_f1 = c(metrics_orig$summary$macro_f1, metrics_2pc$summary$macro_f1)
)
write.csv(metrics_compare, file = file.path(outputs_dir, "metrics_comparison.csv"), row.names = FALSE)
# The below was made with help from ChatGPT because the psych package is confusing
# when run non-interactively (e.g. via Rscript) also dump the key plots into a
# single pdf so there is a viewable artifact
if (!interactive() && has_ggplot2) {
pdf("Rplots_pca_fixed.pdf", width = 8, height = 6)
if (has_psych) {
psych::pairs.panels(
wine[,-1],
gap = 0,
bg = c("red","gold","royalblue")[wine$Type],
pch = 21,
main = "wine (uci) scatterplot matrix by class"
)
}
if (exists("p_scree") && inherits(p_scree, "ggplot")) print(p_scree)
if (exists("p_pc12") && inherits(p_pc12, "ggplot")) print(p_pc12)
dev.off()
}
+5
View File
@@ -0,0 +1,5 @@
pred
truth 1 2 3
1 15 2 0
2 1 19 1
3 0 1 15
+4
View File
@@ -0,0 +1,4 @@
"","1","2","3"
"1",15,2,0
"2",1,19,1
"3",0,1,15
1 1 2 3
2 1 15 2 0
3 2 1 19 1
4 3 0 1 15
+5
View File
@@ -0,0 +1,5 @@
pred
truth 1 2 3
1 17 0 0
2 1 18 2
3 0 0 16
@@ -0,0 +1,4 @@
"","1","2","3"
"1",17,0,0
"2",1,18,2
"3",0,0,16
1 1 2 3
2 1 17 0 0
3 2 1 18 2
4 3 0 0 16
+4
View File
@@ -0,0 +1,4 @@
"class","precision","recall","f1"
"1",0.9375,0.882352941176471,0.909090909090909
"2",0.863636363636364,0.904761904761905,0.883720930232558
"3",0.9375,0.9375,0.9375
1 class precision recall f1
2 1 0.9375 0.882352941176471 0.909090909090909
3 2 0.863636363636364 0.904761904761905 0.883720930232558
4 3 0.9375 0.9375 0.9375
+2
View File
@@ -0,0 +1,2 @@
"accuracy","macro_precision","macro_recall","macro_f1"
0.907407407407407,0.912878787878788,0.908204948646125,0.910103946441156
1 accuracy macro_precision macro_recall macro_f1
2 0.907407407407407 0.912878787878788 0.908204948646125 0.910103946441156
+3
View File
@@ -0,0 +1,3 @@
"model","accuracy","macro_precision","macro_recall","macro_f1"
"original_variables",0.944444444444444,0.944444444444444,0.952380952380952,0.94522732169791
"first_2_pcs",0.907407407407407,0.912878787878788,0.908204948646125,0.910103946441156
1 model accuracy macro_precision macro_recall macro_f1
2 original_variables 0.944444444444444 0.944444444444444 0.952380952380952 0.94522732169791
3 first_2_pcs 0.907407407407407 0.912878787878788 0.908204948646125 0.910103946441156
@@ -0,0 +1,4 @@
"class","precision","recall","f1"
"1",0.944444444444444,1,0.971428571428571
"2",1,0.857142857142857,0.923076923076923
"3",0.888888888888889,1,0.941176470588235
1 class precision recall f1
2 1 0.944444444444444 1 0.971428571428571
3 2 1 0.857142857142857 0.923076923076923
4 3 0.888888888888889 1 0.941176470588235
@@ -0,0 +1,2 @@
"accuracy","macro_precision","macro_recall","macro_f1"
0.944444444444444,0.944444444444444,0.952380952380952,0.94522732169791
1 accuracy macro_precision macro_recall macro_f1
2 0.944444444444444 0.944444444444444 0.952380952380952 0.94522732169791
Binary file not shown.

After

Width:  |  Height:  |  Size: 344 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 227 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

+6
View File
@@ -0,0 +1,6 @@
"Variable","PC1"
"Flavanoids",0.430570697054093
"Total_phenols",0.388556731445086
"OD280_OD315",0.379238757892512
"Proanthocyanins",0.318149910146199
"Nonflavanoid_phenols",-0.292569052362651
1 Variable PC1
2 Flavanoids 0.430570697054093
3 Total_phenols 0.388556731445086
4 OD280_OD315 0.379238757892512
5 Proanthocyanins 0.318149910146199
6 Nonflavanoid_phenols -0.292569052362651
+6
View File
@@ -0,0 +1,6 @@
"Variable","PC2"
"Color_intensity",-0.504116493512561
"Alcohol",-0.480328824227057
"Ash",-0.369020648548877
"Proline",-0.3555672525193
"Hue",0.300324646690879
1 Variable PC2
2 Color_intensity -0.504116493512561
3 Alcohol -0.480328824227057
4 Ash -0.369020648548877
5 Proline -0.3555672525193
6 Hue 0.300324646690879
+178
View File
@@ -0,0 +1,178 @@
1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295
1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045
1,13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85,7.22,1.01,3.55,1045
1,14.1,2.16,2.3,18,105,2.95,3.32,.22,2.38,5.75,1.25,3.17,1510
1,14.12,1.48,2.32,16.8,95,2.2,2.43,.26,1.57,5,1.17,2.82,1280
1,13.75,1.73,2.41,16,89,2.6,2.76,.29,1.81,5.6,1.15,2.9,1320
1,14.75,1.73,2.39,11.4,91,3.1,3.69,.43,2.81,5.4,1.25,2.73,1150
1,14.38,1.87,2.38,12,102,3.3,3.64,.29,2.96,7.5,1.2,3,1547
1,13.63,1.81,2.7,17.2,112,2.85,2.91,.3,1.46,7.3,1.28,2.88,1310
1,14.3,1.92,2.72,20,120,2.8,3.14,.33,1.97,6.2,1.07,2.65,1280
1,13.83,1.57,2.62,20,115,2.95,3.4,.4,1.72,6.6,1.13,2.57,1130
1,14.19,1.59,2.48,16.5,108,3.3,3.93,.32,1.86,8.7,1.23,2.82,1680
1,13.64,3.1,2.56,15.2,116,2.7,3.03,.17,1.66,5.1,.96,3.36,845
1,14.06,1.63,2.28,16,126,3,3.17,.24,2.1,5.65,1.09,3.71,780
1,12.93,3.8,2.65,18.6,102,2.41,2.41,.25,1.98,4.5,1.03,3.52,770
1,13.71,1.86,2.36,16.6,101,2.61,2.88,.27,1.69,3.8,1.11,4,1035
1,12.85,1.6,2.52,17.8,95,2.48,2.37,.26,1.46,3.93,1.09,3.63,1015
1,13.5,1.81,2.61,20,96,2.53,2.61,.28,1.66,3.52,1.12,3.82,845
1,13.05,2.05,3.22,25,124,2.63,2.68,.47,1.92,3.58,1.13,3.2,830
1,13.39,1.77,2.62,16.1,93,2.85,2.94,.34,1.45,4.8,.92,3.22,1195
1,13.3,1.72,2.14,17,94,2.4,2.19,.27,1.35,3.95,1.02,2.77,1285
1,13.87,1.9,2.8,19.4,107,2.95,2.97,.37,1.76,4.5,1.25,3.4,915
1,14.02,1.68,2.21,16,96,2.65,2.33,.26,1.98,4.7,1.04,3.59,1035
1,13.73,1.5,2.7,22.5,101,3,3.25,.29,2.38,5.7,1.19,2.71,1285
1,13.58,1.66,2.36,19.1,106,2.86,3.19,.22,1.95,6.9,1.09,2.88,1515
1,13.68,1.83,2.36,17.2,104,2.42,2.69,.42,1.97,3.84,1.23,2.87,990
1,13.76,1.53,2.7,19.5,132,2.95,2.74,.5,1.35,5.4,1.25,3,1235
1,13.51,1.8,2.65,19,110,2.35,2.53,.29,1.54,4.2,1.1,2.87,1095
1,13.48,1.81,2.41,20.5,100,2.7,2.98,.26,1.86,5.1,1.04,3.47,920
1,13.28,1.64,2.84,15.5,110,2.6,2.68,.34,1.36,4.6,1.09,2.78,880
1,13.05,1.65,2.55,18,98,2.45,2.43,.29,1.44,4.25,1.12,2.51,1105
1,13.07,1.5,2.1,15.5,98,2.4,2.64,.28,1.37,3.7,1.18,2.69,1020
1,14.22,3.99,2.51,13.2,128,3,3.04,.2,2.08,5.1,.89,3.53,760
1,13.56,1.71,2.31,16.2,117,3.15,3.29,.34,2.34,6.13,.95,3.38,795
1,13.41,3.84,2.12,18.8,90,2.45,2.68,.27,1.48,4.28,.91,3,1035
1,13.88,1.89,2.59,15,101,3.25,3.56,.17,1.7,5.43,.88,3.56,1095
1,13.24,3.98,2.29,17.5,103,2.64,2.63,.32,1.66,4.36,.82,3,680
1,13.05,1.77,2.1,17,107,3,3,.28,2.03,5.04,.88,3.35,885
1,14.21,4.04,2.44,18.9,111,2.85,2.65,.3,1.25,5.24,.87,3.33,1080
1,14.38,3.59,2.28,16,102,3.25,3.17,.27,2.19,4.9,1.04,3.44,1065
1,13.9,1.68,2.12,16,101,3.1,3.39,.21,2.14,6.1,.91,3.33,985
1,14.1,2.02,2.4,18.8,103,2.75,2.92,.32,2.38,6.2,1.07,2.75,1060
1,13.94,1.73,2.27,17.4,108,2.88,3.54,.32,2.08,8.90,1.12,3.1,1260
1,13.05,1.73,2.04,12.4,92,2.72,3.27,.17,2.91,7.2,1.12,2.91,1150
1,13.83,1.65,2.6,17.2,94,2.45,2.99,.22,2.29,5.6,1.24,3.37,1265
1,13.82,1.75,2.42,14,111,3.88,3.74,.32,1.87,7.05,1.01,3.26,1190
1,13.77,1.9,2.68,17.1,115,3,2.79,.39,1.68,6.3,1.13,2.93,1375
1,13.74,1.67,2.25,16.4,118,2.6,2.9,.21,1.62,5.85,.92,3.2,1060
1,13.56,1.73,2.46,20.5,116,2.96,2.78,.2,2.45,6.25,.98,3.03,1120
1,14.22,1.7,2.3,16.3,118,3.2,3,.26,2.03,6.38,.94,3.31,970
1,13.29,1.97,2.68,16.8,102,3,3.23,.31,1.66,6,1.07,2.84,1270
1,13.72,1.43,2.5,16.7,108,3.4,3.67,.19,2.04,6.8,.89,2.87,1285
2,12.37,.94,1.36,10.6,88,1.98,.57,.28,.42,1.95,1.05,1.82,520
2,12.33,1.1,2.28,16,101,2.05,1.09,.63,.41,3.27,1.25,1.67,680
2,12.64,1.36,2.02,16.8,100,2.02,1.41,.53,.62,5.75,.98,1.59,450
2,13.67,1.25,1.92,18,94,2.1,1.79,.32,.73,3.8,1.23,2.46,630
2,12.37,1.13,2.16,19,87,3.5,3.1,.19,1.87,4.45,1.22,2.87,420
2,12.17,1.45,2.53,19,104,1.89,1.75,.45,1.03,2.95,1.45,2.23,355
2,12.37,1.21,2.56,18.1,98,2.42,2.65,.37,2.08,4.6,1.19,2.3,678
2,13.11,1.01,1.7,15,78,2.98,3.18,.26,2.28,5.3,1.12,3.18,502
2,12.37,1.17,1.92,19.6,78,2.11,2,.27,1.04,4.68,1.12,3.48,510
2,13.34,.94,2.36,17,110,2.53,1.3,.55,.42,3.17,1.02,1.93,750
2,12.21,1.19,1.75,16.8,151,1.85,1.28,.14,2.5,2.85,1.28,3.07,718
2,12.29,1.61,2.21,20.4,103,1.1,1.02,.37,1.46,3.05,.906,1.82,870
2,13.86,1.51,2.67,25,86,2.95,2.86,.21,1.87,3.38,1.36,3.16,410
2,13.49,1.66,2.24,24,87,1.88,1.84,.27,1.03,3.74,.98,2.78,472
2,12.99,1.67,2.6,30,139,3.3,2.89,.21,1.96,3.35,1.31,3.5,985
2,11.96,1.09,2.3,21,101,3.38,2.14,.13,1.65,3.21,.99,3.13,886
2,11.66,1.88,1.92,16,97,1.61,1.57,.34,1.15,3.8,1.23,2.14,428
2,13.03,.9,1.71,16,86,1.95,2.03,.24,1.46,4.6,1.19,2.48,392
2,11.84,2.89,2.23,18,112,1.72,1.32,.43,.95,2.65,.96,2.52,500
2,12.33,.99,1.95,14.8,136,1.9,1.85,.35,2.76,3.4,1.06,2.31,750
2,12.7,3.87,2.4,23,101,2.83,2.55,.43,1.95,2.57,1.19,3.13,463
2,12,.92,2,19,86,2.42,2.26,.3,1.43,2.5,1.38,3.12,278
2,12.72,1.81,2.2,18.8,86,2.2,2.53,.26,1.77,3.9,1.16,3.14,714
2,12.08,1.13,2.51,24,78,2,1.58,.4,1.4,2.2,1.31,2.72,630
2,13.05,3.86,2.32,22.5,85,1.65,1.59,.61,1.62,4.8,.84,2.01,515
2,11.84,.89,2.58,18,94,2.2,2.21,.22,2.35,3.05,.79,3.08,520
2,12.67,.98,2.24,18,99,2.2,1.94,.3,1.46,2.62,1.23,3.16,450
2,12.16,1.61,2.31,22.8,90,1.78,1.69,.43,1.56,2.45,1.33,2.26,495
2,11.65,1.67,2.62,26,88,1.92,1.61,.4,1.34,2.6,1.36,3.21,562
2,11.64,2.06,2.46,21.6,84,1.95,1.69,.48,1.35,2.8,1,2.75,680
2,12.08,1.33,2.3,23.6,70,2.2,1.59,.42,1.38,1.74,1.07,3.21,625
2,12.08,1.83,2.32,18.5,81,1.6,1.5,.52,1.64,2.4,1.08,2.27,480
2,12,1.51,2.42,22,86,1.45,1.25,.5,1.63,3.6,1.05,2.65,450
2,12.69,1.53,2.26,20.7,80,1.38,1.46,.58,1.62,3.05,.96,2.06,495
2,12.29,2.83,2.22,18,88,2.45,2.25,.25,1.99,2.15,1.15,3.3,290
2,11.62,1.99,2.28,18,98,3.02,2.26,.17,1.35,3.25,1.16,2.96,345
2,12.47,1.52,2.2,19,162,2.5,2.27,.32,3.28,2.6,1.16,2.63,937
2,11.81,2.12,2.74,21.5,134,1.6,.99,.14,1.56,2.5,.95,2.26,625
2,12.29,1.41,1.98,16,85,2.55,2.5,.29,1.77,2.9,1.23,2.74,428
2,12.37,1.07,2.1,18.5,88,3.52,3.75,.24,1.95,4.5,1.04,2.77,660
2,12.29,3.17,2.21,18,88,2.85,2.99,.45,2.81,2.3,1.42,2.83,406
2,12.08,2.08,1.7,17.5,97,2.23,2.17,.26,1.4,3.3,1.27,2.96,710
2,12.6,1.34,1.9,18.5,88,1.45,1.36,.29,1.35,2.45,1.04,2.77,562
2,12.34,2.45,2.46,21,98,2.56,2.11,.34,1.31,2.8,.8,3.38,438
2,11.82,1.72,1.88,19.5,86,2.5,1.64,.37,1.42,2.06,.94,2.44,415
2,12.51,1.73,1.98,20.5,85,2.2,1.92,.32,1.48,2.94,1.04,3.57,672
2,12.42,2.55,2.27,22,90,1.68,1.84,.66,1.42,2.7,.86,3.3,315
2,12.25,1.73,2.12,19,80,1.65,2.03,.37,1.63,3.4,1,3.17,510
2,12.72,1.75,2.28,22.5,84,1.38,1.76,.48,1.63,3.3,.88,2.42,488
2,12.22,1.29,1.94,19,92,2.36,2.04,.39,2.08,2.7,.86,3.02,312
2,11.61,1.35,2.7,20,94,2.74,2.92,.29,2.49,2.65,.96,3.26,680
2,11.46,3.74,1.82,19.5,107,3.18,2.58,.24,3.58,2.9,.75,2.81,562
2,12.52,2.43,2.17,21,88,2.55,2.27,.26,1.22,2,.9,2.78,325
2,11.76,2.68,2.92,20,103,1.75,2.03,.6,1.05,3.8,1.23,2.5,607
2,11.41,.74,2.5,21,88,2.48,2.01,.42,1.44,3.08,1.1,2.31,434
2,12.08,1.39,2.5,22.5,84,2.56,2.29,.43,1.04,2.9,.93,3.19,385
2,11.03,1.51,2.2,21.5,85,2.46,2.17,.52,2.01,1.9,1.71,2.87,407
2,11.82,1.47,1.99,20.8,86,1.98,1.6,.3,1.53,1.95,.95,3.33,495
2,12.42,1.61,2.19,22.5,108,2,2.09,.34,1.61,2.06,1.06,2.96,345
2,12.77,3.43,1.98,16,80,1.63,1.25,.43,.83,3.4,.7,2.12,372
2,12,3.43,2,19,87,2,1.64,.37,1.87,1.28,.93,3.05,564
2,11.45,2.4,2.42,20,96,2.9,2.79,.32,1.83,3.25,.8,3.39,625
2,11.56,2.05,3.23,28.5,119,3.18,5.08,.47,1.87,6,.93,3.69,465
2,12.42,4.43,2.73,26.5,102,2.2,2.13,.43,1.71,2.08,.92,3.12,365
2,13.05,5.8,2.13,21.5,86,2.62,2.65,.3,2.01,2.6,.73,3.1,380
2,11.87,4.31,2.39,21,82,2.86,3.03,.21,2.91,2.8,.75,3.64,380
2,12.07,2.16,2.17,21,85,2.6,2.65,.37,1.35,2.76,.86,3.28,378
2,12.43,1.53,2.29,21.5,86,2.74,3.15,.39,1.77,3.94,.69,2.84,352
2,11.79,2.13,2.78,28.5,92,2.13,2.24,.58,1.76,3,.97,2.44,466
2,12.37,1.63,2.3,24.5,88,2.22,2.45,.4,1.9,2.12,.89,2.78,342
2,12.04,4.3,2.38,22,80,2.1,1.75,.42,1.35,2.6,.79,2.57,580
3,12.86,1.35,2.32,18,122,1.51,1.25,.21,.94,4.1,.76,1.29,630
3,12.88,2.99,2.4,20,104,1.3,1.22,.24,.83,5.4,.74,1.42,530
3,12.81,2.31,2.4,24,98,1.15,1.09,.27,.83,5.7,.66,1.36,560
3,12.7,3.55,2.36,21.5,106,1.7,1.2,.17,.84,5,.78,1.29,600
3,12.51,1.24,2.25,17.5,85,2,.58,.6,1.25,5.45,.75,1.51,650
3,12.6,2.46,2.2,18.5,94,1.62,.66,.63,.94,7.1,.73,1.58,695
3,12.25,4.72,2.54,21,89,1.38,.47,.53,.8,3.85,.75,1.27,720
3,12.53,5.51,2.64,25,96,1.79,.6,.63,1.1,5,.82,1.69,515
3,13.49,3.59,2.19,19.5,88,1.62,.48,.58,.88,5.7,.81,1.82,580
3,12.84,2.96,2.61,24,101,2.32,.6,.53,.81,4.92,.89,2.15,590
3,12.93,2.81,2.7,21,96,1.54,.5,.53,.75,4.6,.77,2.31,600
3,13.36,2.56,2.35,20,89,1.4,.5,.37,.64,5.6,.7,2.47,780
3,13.52,3.17,2.72,23.5,97,1.55,.52,.5,.55,4.35,.89,2.06,520
3,13.62,4.95,2.35,20,92,2,.8,.47,1.02,4.4,.91,2.05,550
3,12.25,3.88,2.2,18.5,112,1.38,.78,.29,1.14,8.21,.65,2,855
3,13.16,3.57,2.15,21,102,1.5,.55,.43,1.3,4,.6,1.68,830
3,13.88,5.04,2.23,20,80,.98,.34,.4,.68,4.9,.58,1.33,415
3,12.87,4.61,2.48,21.5,86,1.7,.65,.47,.86,7.65,.54,1.86,625
3,13.32,3.24,2.38,21.5,92,1.93,.76,.45,1.25,8.42,.55,1.62,650
3,13.08,3.9,2.36,21.5,113,1.41,1.39,.34,1.14,9.40,.57,1.33,550
3,13.5,3.12,2.62,24,123,1.4,1.57,.22,1.25,8.60,.59,1.3,500
3,12.79,2.67,2.48,22,112,1.48,1.36,.24,1.26,10.8,.48,1.47,480
3,13.11,1.9,2.75,25.5,116,2.2,1.28,.26,1.56,7.1,.61,1.33,425
3,13.23,3.3,2.28,18.5,98,1.8,.83,.61,1.87,10.52,.56,1.51,675
3,12.58,1.29,2.1,20,103,1.48,.58,.53,1.4,7.6,.58,1.55,640
3,13.17,5.19,2.32,22,93,1.74,.63,.61,1.55,7.9,.6,1.48,725
3,13.84,4.12,2.38,19.5,89,1.8,.83,.48,1.56,9.01,.57,1.64,480
3,12.45,3.03,2.64,27,97,1.9,.58,.63,1.14,7.5,.67,1.73,880
3,14.34,1.68,2.7,25,98,2.8,1.31,.53,2.7,13,.57,1.96,660
3,13.48,1.67,2.64,22.5,89,2.6,1.1,.52,2.29,11.75,.57,1.78,620
3,12.36,3.83,2.38,21,88,2.3,.92,.5,1.04,7.65,.56,1.58,520
3,13.69,3.26,2.54,20,107,1.83,.56,.5,.8,5.88,.96,1.82,680
3,12.85,3.27,2.58,22,106,1.65,.6,.6,.96,5.58,.87,2.11,570
3,12.96,3.45,2.35,18.5,106,1.39,.7,.4,.94,5.28,.68,1.75,675
3,13.78,2.76,2.3,22,90,1.35,.68,.41,1.03,9.58,.7,1.68,615
3,13.73,4.36,2.26,22.5,88,1.28,.47,.52,1.15,6.62,.78,1.75,520
3,13.45,3.7,2.6,23,111,1.7,.92,.43,1.46,10.68,.85,1.56,695
3,12.82,3.37,2.3,19.5,88,1.48,.66,.4,.97,10.26,.72,1.75,685
3,13.58,2.58,2.69,24.5,105,1.55,.84,.39,1.54,8.66,.74,1.8,750
3,13.4,4.6,2.86,25,112,1.98,.96,.27,1.11,8.5,.67,1.92,630
3,12.2,3.03,2.32,19,96,1.25,.49,.4,.73,5.5,.66,1.83,510
3,12.77,2.39,2.28,19.5,86,1.39,.51,.48,.64,9.899999,.57,1.63,470
3,14.16,2.51,2.48,20,91,1.68,.7,.44,1.24,9.7,.62,1.71,660
3,13.71,5.65,2.45,20.5,95,1.68,.61,.52,1.06,7.7,.64,1.74,740
3,13.4,3.91,2.48,23,102,1.8,.75,.43,1.41,7.3,.7,1.56,750
3,13.27,4.28,2.26,20,120,1.59,.69,.43,1.35,10.2,.59,1.56,835
3,13.17,2.59,2.37,20,120,1.65,.68,.53,1.46,9.3,.6,1.62,840
3,14.13,4.1,2.74,24.5,96,2.05,.76,.56,1.35,9.2,.61,1.6,560
+100
View File
@@ -0,0 +1,100 @@
1. Title of Database: Wine recognition data
Updated Sept 21, 1998 by C.Blake : Added attribute information
2. Sources:
(a) Forina, M. et al, PARVUS - An Extendible Package for Data
Exploration, Classification and Correlation. Institute of Pharmaceutical
and Food Analysis and Technologies, Via Brigata Salerno,
16147 Genoa, Italy.
(b) Stefan Aeberhard, email: stefan@coral.cs.jcu.edu.au
(c) July 1991
3. Past Usage:
(1)
S. Aeberhard, D. Coomans and O. de Vel,
Comparison of Classifiers in High Dimensional Settings,
Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of
Mathematics and Statistics, James Cook University of North Queensland.
(Also submitted to Technometrics).
The data was used with many others for comparing various
classifiers. The classes are separable, though only RDA
has achieved 100% correct classification.
(RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data))
(All results using the leave-one-out technique)
In a classification context, this is a well posed problem
with "well behaved" class structures. A good data set
for first testing of a new classifier, but not very
challenging.
(2)
S. Aeberhard, D. Coomans and O. de Vel,
"THE CLASSIFICATION PERFORMANCE OF RDA"
Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of
Mathematics and Statistics, James Cook University of North Queensland.
(Also submitted to Journal of Chemometrics).
Here, the data was used to illustrate the superior performance of
the use of a new appreciation function with RDA.
4. Relevant Information:
-- These data are the results of a chemical analysis of
wines grown in the same region in Italy but derived from three
different cultivars.
The analysis determined the quantities of 13 constituents
found in each of the three types of wines.
-- I think that the initial data set had around 30 variables, but
for some reason I only have the 13 dimensional version.
I had a list of what the 30 or so variables were, but a.)
I lost it, and b.), I would not know which 13 variables
are included in the set.
-- The attributes are (donated by Riccardo Leardi,
riclea@anchem.unige.it )
1) Alcohol
2) Malic acid
3) Ash
4) Alcalinity of ash
5) Magnesium
6) Total phenols
7) Flavanoids
8) Nonflavanoid phenols
9) Proanthocyanins
10)Color intensity
11)Hue
12)OD280/OD315 of diluted wines
13)Proline
5. Number of Instances
class 1 59
class 2 71
class 3 48
6. Number of Attributes
13
7. For Each Attribute:
All attributes are continuous
No statistics available, but suggest to standardise
variables for certain uses (e.g. for use with classifiers
which are NOT scale invariant)
NOTE: 1st attribute is class identifier (1-3)
8. Missing Attribute Values:
None
9. Class Distribution: number of instances per class
class 1 59
class 2 71
class 3 48