2 Commits

Author SHA1 Message Date
ION606 88f2975b86 added lab 4 2025-10-31 17:55:13 -04:00
ION606 dc2ceac7de transfer 2025-10-17 09:26:49 -04:00
60 changed files with 730 additions and 5 deletions

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 24 KiB

Before

Width:  |  Height:  |  Size: 25 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Before

Width:  |  Height:  |  Size: 42 KiB

After

Width:  |  Height:  |  Size: 42 KiB

Before

Width:  |  Height:  |  Size: 63 KiB

After

Width:  |  Height:  |  Size: 63 KiB

Before

Width:  |  Height:  |  Size: 62 KiB

After

Width:  |  Height:  |  Size: 62 KiB

Before

Width:  |  Height:  |  Size: 63 KiB

After

Width:  |  Height:  |  Size: 63 KiB

Before

Width:  |  Height:  |  Size: 64 KiB

After

Width:  |  Height:  |  Size: 64 KiB

-5
View File
@@ -1,5 +0,0 @@
node_modules
.venv
.vscode
Assignment III
tmp/
BIN
View File
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,41 @@
##########################################
### Principal Component Analysis (PCA) ###
##########################################
## load libraries
library(ggplot2)
library(ggfortify)
library(GGally)
library(e1071)
library(class)
library(psych)
library(readr)
## set working directory so that files can be referenced without the full path
## NOTE(review): absolute, machine-specific path -- the script stops here on any
## other machine; point it at the folder containing wine.data before running
setwd("/home/ion606/Desktop/Data Analytics/Lab 4")
## read dataset (UCI wine data is a headerless CSV, hence col_names = FALSE)
wine <- read_csv("wine.data", col_names = FALSE)
## set column names (col 1 is the class label, cols 2-14 are the 13 predictors)
names(wine) <- c("Type","Alcohol","Malic acid","Ash","Alcalinity of ash","Magnesium","Total phenols","Flavanoids","Nonflavanoid Phenols","Proanthocyanins","Color Intensity","Hue","Od280/od315 of diluted wines","Proline")
## inspect data frame
head(wine)
## change the data type of the "Type" column from character to factor
####
# Factors look like regular strings (characters) but with factors R knows
# that the column is a categorical variable with finite possible values
# e.g. "Type" in the Wine dataset can only be 1, 2, or 3
####
wine$Type <- as.factor(wine$Type)
## visualize variables
## scatterplot matrix of the 13 predictors; point fill (bg) encodes wine class
pairs.panels(wine[,-1],gap = 0,bg = c("red", "yellow", "blue")[wine$Type],pch=21)
## ggpairs matrix with panels coloured by class
ggpairs(wine, ggplot2::aes(colour = Type))
###
Binary file not shown.
+366
View File
@@ -0,0 +1,366 @@
# Return TRUE when `pkg` is installed and loadable, without attaching it or
# printing startup noise; used below to gate optional dependencies.
has_pkg <- function(pkg) {
  requireNamespace(pkg, quietly = TRUE)
}
# record which optional packages are installed so the script can degrade
# gracefully instead of dying on a missing library
has_ggplot2 <- has_pkg("ggplot2")
has_GGally <- has_pkg("GGally")
has_e1071 <- has_pkg("e1071")
has_class <- has_pkg("class")
has_psych <- has_pkg("psych")
has_readr <- has_pkg("readr")
# WHY IS THIS HERE YOU MIGHT ASK???? WELL LET ME TELL YOU I SPENT TWO HOURS ON STUPID PACKAGE IMPORTS
# OOOOOOHHH PSYCH IS IN A DIFFERENT REPO??? OH IT ISN'T??? I have a fever of 103 I DO NOT CARE
# attach whatever exists; only `class` (provides knn) is a hard requirement
if (has_ggplot2) { library(ggplot2) } else { warning("ggplot2 not available; plots will be skipped") }
if (has_GGally) { library(GGally) } else { message("GGally not available; skipping ggpairs plot") }
if (has_e1071) { library(e1071) }
if (has_class) { library(class) } else { stop("class package not available for kNN") }
if (!has_psych) { message("psych not available; skipping pairs.panels plot") }
if (has_readr) { library(readr) }
library(grid) # unit() for arrows in plots
# request the pre-R-3.6.0 sample() algorithm so set.seed() reproduces results
# from older R; suppressWarnings hides the "non-default" warning newer R emits
suppressWarnings(RNGkind(sample.kind = "Rounding"))
# set a reproducible seed
set.seed(4600)
# Locate and load the UCI wine dataset:
# 178 rows; col 1 is the class label (1,2,3); the other 13 columns are
# continuous predictors.
candidate_files <- c(
  "wine.data",
  "./wine.data",
  "../wine.data",
  "DAN/wine.data",
  "./DAN/wine.data"
)
# file.exists() is vectorized: keep the candidates that are present and
# take the first one (same precedence order as before).
existing_files <- candidate_files[file.exists(candidate_files)]
data_path <- if (length(existing_files) > 0) existing_files[[1]] else NA
if (is.na(data_path)) {
  stop("could not find wine.data; place this script in the DAN folder or given/ and re-run")
}
# Prefer readr when available (quiet, no type-guessing chatter);
# otherwise fall back to base read.csv. The file has no header row.
if (has_readr) {
  wine <- readr::read_csv(
    file = data_path,
    col_names = FALSE,
    show_col_types = FALSE,
    progress = FALSE
  )
} else {
  wine <- read.csv(file = data_path, header = FALSE)
}
# Assign descriptive column names; the first column is the class label.
wine_col_names <- c(
  "Type", "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash",
  "Magnesium", "Total_phenols", "Flavanoids", "Nonflavanoid_phenols",
  "Proanthocyanins", "Color_intensity", "Hue", "OD280_OD315", "Proline"
)
colnames(wine) <- wine_col_names
# The class label is categorical with values 1/2/3, so store it as a factor.
wine$Type <- as.factor(wine$Type)
# Fail fast if the wrong file was read: the UCI wine data is exactly 178 x 14.
# (kept from an earlier debugging session where the wrong file kept being
# loaded; it makes the script more, "robust")
stopifnot(nrow(wine) == 178, ncol(wine) == 14)
print(summary(wine$Type))
# exploratory plots (because I went down a rabbit hole and by god I'm using it)
if (has_psych) {
# pairs panel (psych) colors by class
# scatterplot matrix of all 13 predictors; bg fills points by wine class
psych::pairs.panels(
wine[,-1],
gap = 0,
bg = c("red","gold","royalblue")[wine$Type],
pch = 21,
main = "wine (uci) scatterplot matrix by class"
)
}
if (has_GGally && has_ggplot2) {
# ggpairs for nice matrix <3
# columns = 2:ncol(wine) keeps the Type column itself out of the panel grid
GGally::ggpairs(wine, ggplot2::aes(colour = Type), columns = 2:ncol(wine))
}
# split into train/test BEFORE any preprocessing (scaling, PCA) so test
# rows can never leak into the fitted statistics
set.seed(4600)
n <- nrow(wine)
train_idx <- sample.int(n, size = floor(0.7 * n))
wine_train <- wine[train_idx, , drop = FALSE]
wine_test <- wine[-train_idx, , drop = FALSE]
X_train <- wine_train[, -1]   # 13 predictors
y_train <- wine_train$Type    # class labels
X_test <- wine_test[, -1]
y_test <- wine_test$Type
# guard: scale()/prcomp(scale. = TRUE) divide by the column sd, so a
# zero-variance training column would produce NaN/Inf downstream.
# vapply pins the return type (logical), unlike sapply.
zero_var <- vapply(X_train, function(x) var(x, na.rm = TRUE) == 0, logical(1))
if (any(zero_var)) {
warning("one or more predictors have zero variance in the training set; scale() would fail")
}
# `||` (scalar, short-circuiting) is the correct operator inside `if`;
# the elementwise `|` used previously errors on length > 1 in R >= 4.3
if (anyNA(X_train) || anyNA(X_test)) {
stop("found NA values in predictors; handle missingness before PCA")
}
# fit the PCA on TRAINING data only; both train and test are projected with it
pca_tr <- prcomp(X_train, center = TRUE, scale. = TRUE)
# proportion of variance explained by each component
pve_tr <- (pca_tr$sdev^2) / sum(pca_tr$sdev^2)
pve_df <- data.frame(
PC = paste0("PC", seq_along(pve_tr)),
PVE = pve_tr,
CumPVE = cumsum(pve_tr)
)
print("variance explained (training pca):")
print(pve_df)
# scree plot from training pca
# (seq_along replaces the 1:length(...) anti-pattern, which misbehaves on
# zero-length input)
p_scree <- ggplot(pve_df, aes(x = seq_along(PVE), y = PVE)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
labs(title = "scree plot variance explained by principal components (training pca)",
x = "principal component", y = "proportion of variance explained") +
theme_minimal()
# cumulative variance plot from training pca
p_cumvar <- ggplot(pve_df, aes(x = seq_along(CumPVE), y = CumPVE)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
labs(title = "cumulative variance explained (training pca)",
x = "principal component", y = "cumulative proportion of variance") +
theme_minimal()
# ========================================================================================================
# choose number of pcs: default to the smallest k with >= thresh cum variance
# you can change thresh to 0.90 or 0.99 if you prefer
pc_variance_threshold <- 0.95
k_pcs <- which(cumsum(pve_tr) >= pc_variance_threshold)[1]
# which(...)[1] is NA when no component reaches the threshold; keep them all then
if (is.na(k_pcs)) k_pcs <- ncol(X_train)
cat("chosen number of pcs (threshold =", pc_variance_threshold, "):", k_pcs, "\n")
# project train/test into the pca space
# predict() applies the training centering/scaling, so test rows never
# influence the fitted components (no leakage)
Z_train_full <- as.data.frame(predict(pca_tr, newdata = X_train))
Z_test_full <- as.data.frame(predict(pca_tr, newdata = X_test))
# for downstream modeling: keep only the first k_pcs components chosen above
Z_train <- Z_train_full[, seq_len(k_pcs), drop = FALSE]
Z_test <- Z_test_full[, seq_len(k_pcs), drop = FALSE]
# projection of ALL rows (train + test) -- used only for the visualizations below
scores_all <- as.data.frame(predict(pca_tr, newdata = wine[,-1]))
scores_all$Type <- wine$Type
# loadings from training pca
loadings <- as.data.frame(pca_tr$rotation)
loadings$Variable <- rownames(loadings)
# the five variables with the largest absolute loading on PC1 / PC2
top_pc1 <- loadings[order(abs(loadings$PC1), decreasing = TRUE), c("Variable","PC1")][1:5, ]
top_pc2 <- loadings[order(abs(loadings$PC2), decreasing = TRUE), c("Variable","PC2")][1:5, ]
print("top contributors to pc1 (training pca):"); print(top_pc1)
print("top contributors to pc2 (training pca):"); print(top_pc2)
# function to make convex hull data for each group
# chull() returns the indices of each class's convex hull; the hull points
# are stacked into one data frame used to shade a translucent region per class
scores <- scores_all
hull_df <- do.call(rbind, lapply(split(scores, scores$Type), function(df) {
pts <- df[chull(df$PC1, df$PC2), c("PC1","PC2")]
pts$Type <- unique(df$Type)
pts
}))
# PC1 vs PC2 scatter, points coloured by class, hulls shaded per class
p_pc12 <- ggplot(scores, aes(PC1, PC2, color = Type)) +
geom_point(size = 2, alpha = 0.85) +
geom_polygon(data = hull_df, aes(fill = Type, group = Type), color = NA, alpha = 0.15) +
guides(fill = "none") +
theme_minimal() +
labs(title = "pc1 vs pc2 by class (projected with training pca)")
# arrow arrow arrow arrow arrow arrow arrow arrow arrow
# scale the loading vectors up so the arrows are visible against the scores
loading_scalefactor <- 3 * max(abs(scores$PC1), abs(scores$PC2)) # heuristic
load_plot_df <- loadings
load_plot_df$PC1s <- load_plot_df$PC1 * loading_scalefactor
load_plot_df$PC2s <- load_plot_df$PC2 * loading_scalefactor
# biplot: scores coloured by class plus black loading arrows with labels
p_biplot <- ggplot(scores, aes(PC1, PC2, color = Type)) +
geom_point(size = 2, alpha = 0.85) +
geom_segment(
data = load_plot_df,
mapping = aes(x = 0, y = 0, xend = PC1s, yend = PC2s),
inherit.aes = FALSE,
arrow = arrow(length = unit(0.02, "npc")),
color = "black",
alpha = 0.8
) +
geom_text(
data = load_plot_df,
mapping = aes(x = PC1s, y = PC2s, label = Variable),
inherit.aes = FALSE,
hjust = 0,
vjust = 0
) +
theme_minimal() +
labs(title = "pc1 vs pc2 with variable loadings (training pca projection)")
# Two models are compared downstream:
# 1) kNN on original variables with standardization
# 2) kNN on first 2 principal components only
#
# compute_metrics: per-class and macro-averaged classification metrics from a
# confusion matrix.
#   cm      square matrix/table of counts, rows = true class, cols = predicted
#   returns list(per_class = data.frame(class, precision, recall, f1),
#                summary   = data.frame(accuracy, macro_precision,
#                                       macro_recall, macro_f1))
compute_metrics <- function(cm) {
  lv <- rownames(cm)
  # fall back to positional labels; seq_len handles the degenerate 0-row case
  if (is.null(lv)) lv <- as.character(seq_len(nrow(cm)))
  TP <- diag(cm)
  FP <- colSums(cm) - TP   # predicted as this class but actually another
  FN <- rowSums(cm) - TP   # actually this class but predicted as another
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  # harmonic mean; NaN when precision + recall == 0 (class never predicted
  # and never recalled), which the na.rm below excludes from the macro means
  f1 <- 2 * precision * recall / (precision + recall)
  acc <- sum(TP) / sum(cm)
  macro_precision <- mean(precision, na.rm = TRUE)
  macro_recall <- mean(recall, na.rm = TRUE)
  macro_f1 <- mean(f1, na.rm = TRUE)
  per_class <- data.frame(
    class = lv,
    precision = precision,
    recall = recall,
    f1 = f1,
    row.names = NULL
  )
  summary <- data.frame(
    accuracy = acc,
    macro_precision = macro_precision,
    macro_recall = macro_recall,
    macro_f1 = macro_f1
  )
  list(per_class = per_class, summary = summary)
}
set.seed(4600)
# candidate neighborhood sizes for kNN (odd values 1..15)
ks <- seq(1, 15, by = 2)
# number of cross-validation folds
Kfolds <- 5
# kNN on original vars
# standardize with TRAINING statistics only, then apply the same center/scale
# values to the test set (kNN is distance-based, so scaling matters; reusing
# the training stats avoids leakage)
X_train_scaled <- scale(X_train, center = TRUE, scale = TRUE)
scale_center <- attr(X_train_scaled, "scaled:center")
scale_scale <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = scale_center, scale = scale_scale)
n_train_orig <- nrow(X_train_scaled)
# random fold assignment, kept roughly balanced via rep(length.out = n)
folds_orig <- sample(rep(1:Kfolds, length.out = n_train_orig))
# mean 5-fold CV accuracy for each candidate k
cv_acc_orig <- sapply(ks, function(k) {
mean(sapply(1:Kfolds, function(f) {
tr <- which(folds_orig != f)
va <- which(folds_orig == f)
pred_cv <- knn(train = X_train_scaled[tr, , drop = FALSE],
test = X_train_scaled[va, , drop = FALSE],
cl = y_train[tr], k = k)
mean(pred_cv == y_train[va])
}))
})
# pick the k with the best mean CV accuracy, then refit on all training rows
# and evaluate once on the held-out test set
best_k_orig <- ks[which.max(cv_acc_orig)]
cat("[Original vars] best k:", best_k_orig, "cv acc:", max(cv_acc_orig), "\n")
pred_orig <- knn(train = X_train_scaled, test = X_test_scaled, cl = y_train, k = best_k_orig)
acc_orig <- mean(pred_orig == y_test)
cm_orig <- table(truth = y_test, pred = pred_orig)
cat("[Original vars] held-out accuracy:", round(acc_orig, 4), "\n")
print(cm_orig)
metrics_orig <- compute_metrics(cm_orig)
print(metrics_orig$summary)
print(metrics_orig$per_class)
# kNN on first 2 PCs only
# same CV protocol as above, but in the 2-dimensional PC score space
# (PC scores come from the training-fitted PCA, so no extra scaling is done)
Z2_train <- Z_train_full[, 1:2, drop = FALSE]
Z2_test <- Z_test_full[, 1:2, drop = FALSE]
n_train_2pc <- nrow(Z2_train)
folds_2pc <- sample(rep(1:Kfolds, length.out = n_train_2pc))
# mean 5-fold CV accuracy for each candidate k
cv_acc_2pc <- sapply(ks, function(k) {
mean(sapply(1:Kfolds, function(f) {
tr <- which(folds_2pc != f)
va <- which(folds_2pc == f)
pred_cv <- knn(train = Z2_train[tr, , drop = FALSE],
test = Z2_train[va, , drop = FALSE],
cl = y_train[tr], k = k)
mean(pred_cv == y_train[va])
}))
})
# best k by CV, then a single evaluation on the held-out test set
best_k_2pc <- ks[which.max(cv_acc_2pc)]
cat("[First 2 PCs] best k:", best_k_2pc, "cv acc:", max(cv_acc_2pc), "\n")
pred_2pc <- knn(train = Z2_train, test = Z2_test, cl = y_train, k = best_k_2pc)
acc_2pc <- mean(pred_2pc == y_test)
cm_2pc <- table(truth = y_test, pred = pred_2pc)
cat("[First 2 PCs] held-out accuracy:", round(acc_2pc, 4), "\n")
print(cm_2pc)
metrics_2pc <- compute_metrics(cm_2pc)
print(metrics_2pc$summary)
print(metrics_2pc$per_class)
# ===========================================================================================
# persist all artifacts (plots, tables, metrics) under ./outputs
outputs_dir <- "outputs"
if (!dir.exists(outputs_dir)) dir.create(outputs_dir, recursive = TRUE, showWarnings = FALSE)
# plots
# each ggsave is guarded: the plot object only exists if ggplot2 was available
if (exists("p_pc12") && inherits(p_pc12, "ggplot")) ggsave(filename = file.path(outputs_dir, "pc12_scatter.png"), plot = p_pc12, width = 8, height = 6, dpi = 300)
if (exists("p_biplot") && inherits(p_biplot, "ggplot")) ggsave(filename = file.path(outputs_dir, "pc12_biplot.png"), plot = p_biplot, width = 8, height = 6, dpi = 300)
if (exists("p_scree") && inherits(p_scree, "ggplot")) ggsave(filename = file.path(outputs_dir, "pca_scree.png"), plot = p_scree, width = 8, height = 6, dpi = 300)
if (exists("p_cumvar") && inherits(p_cumvar, "ggplot")) ggsave(filename = file.path(outputs_dir, "pca_cumvar.png"), plot = p_cumvar, width = 8, height = 6, dpi = 300)
# top contributors/vars to PC1 and PC2
write.csv(top_pc1, file = file.path(outputs_dir, "top_contributors_pc1.csv"), row.names = FALSE)
write.csv(top_pc2, file = file.path(outputs_dir, "top_contributors_pc2.csv"), row.names = FALSE)
# confusion matrices as wide CSV and pretty text
# (capture.output preserves the printed table layout for the .txt version)
write.csv(as.matrix(cm_orig), file = file.path(outputs_dir, "confusion_original_wide.csv"))
writeLines(capture.output(cm_orig), con = file.path(outputs_dir, "confusion_original.txt"))
write.csv(as.matrix(cm_2pc), file = file.path(outputs_dir, "confusion_2pc_wide.csv"))
writeLines(capture.output(cm_2pc), con = file.path(outputs_dir, "confusion_2pc.txt"))
# metrics
write.csv(metrics_orig$per_class, file = file.path(outputs_dir, "metrics_original_per_class.csv"), row.names = FALSE)
write.csv(metrics_orig$summary, file = file.path(outputs_dir, "metrics_original_summary.csv"), row.names = FALSE)
write.csv(metrics_2pc$per_class, file = file.path(outputs_dir, "metrics_2pc_per_class.csv"), row.names = FALSE)
write.csv(metrics_2pc$summary, file = file.path(outputs_dir, "metrics_2pc_summary.csv"), row.names = FALSE)
# summary: side-by-side comparison of the two models
metrics_compare <- data.frame(
model = c("original_variables", "first_2_pcs"),
accuracy = c(metrics_orig$summary$accuracy, metrics_2pc$summary$accuracy),
macro_precision = c(metrics_orig$summary$macro_precision, metrics_2pc$summary$macro_precision),
macro_recall = c(metrics_orig$summary$macro_recall, metrics_2pc$summary$macro_recall),
macro_f1 = c(metrics_orig$summary$macro_f1, metrics_2pc$summary$macro_f1)
)
write.csv(metrics_compare, file = file.path(outputs_dir, "metrics_comparison.csv"), row.names = FALSE)
# The below was made with help from ChatGPT because the psych package is confusing
# when run non-interactively (e.g. via Rscript) also dump the key plots into a
# single pdf so there is a viewable artifact
if (!interactive() && has_ggplot2) {
pdf("Rplots_pca_fixed.pdf", width = 8, height = 6)
if (has_psych) {
psych::pairs.panels(
wine[,-1],
gap = 0,
bg = c("red","gold","royalblue")[wine$Type],
pch = 21,
main = "wine (uci) scatterplot matrix by class"
)
}
if (exists("p_scree") && inherits(p_scree, "ggplot")) print(p_scree)
if (exists("p_pc12") && inherits(p_pc12, "ggplot")) print(p_pc12)
dev.off()
}
+5
View File
@@ -0,0 +1,5 @@
pred
truth 1 2 3
1 15 2 0
2 1 19 1
3 0 1 15
+4
View File
@@ -0,0 +1,4 @@
"","1","2","3"
"1",15,2,0
"2",1,19,1
"3",0,1,15
1 1 2 3
2 1 15 2 0
3 2 1 19 1
4 3 0 1 15
+5
View File
@@ -0,0 +1,5 @@
pred
truth 1 2 3
1 17 0 0
2 1 18 2
3 0 0 16
@@ -0,0 +1,4 @@
"","1","2","3"
"1",17,0,0
"2",1,18,2
"3",0,0,16
1 1 2 3
2 1 17 0 0
3 2 1 18 2
4 3 0 0 16
+4
View File
@@ -0,0 +1,4 @@
"class","precision","recall","f1"
"1",0.9375,0.882352941176471,0.909090909090909
"2",0.863636363636364,0.904761904761905,0.883720930232558
"3",0.9375,0.9375,0.9375
1 class precision recall f1
2 1 0.9375 0.882352941176471 0.909090909090909
3 2 0.863636363636364 0.904761904761905 0.883720930232558
4 3 0.9375 0.9375 0.9375
+2
View File
@@ -0,0 +1,2 @@
"accuracy","macro_precision","macro_recall","macro_f1"
0.907407407407407,0.912878787878788,0.908204948646125,0.910103946441156
1 accuracy macro_precision macro_recall macro_f1
2 0.907407407407407 0.912878787878788 0.908204948646125 0.910103946441156
+3
View File
@@ -0,0 +1,3 @@
"model","accuracy","macro_precision","macro_recall","macro_f1"
"original_variables",0.944444444444444,0.944444444444444,0.952380952380952,0.94522732169791
"first_2_pcs",0.907407407407407,0.912878787878788,0.908204948646125,0.910103946441156
1 model accuracy macro_precision macro_recall macro_f1
2 original_variables 0.944444444444444 0.944444444444444 0.952380952380952 0.94522732169791
3 first_2_pcs 0.907407407407407 0.912878787878788 0.908204948646125 0.910103946441156
@@ -0,0 +1,4 @@
"class","precision","recall","f1"
"1",0.944444444444444,1,0.971428571428571
"2",1,0.857142857142857,0.923076923076923
"3",0.888888888888889,1,0.941176470588235
1 class precision recall f1
2 1 0.944444444444444 1 0.971428571428571
3 2 1 0.857142857142857 0.923076923076923
4 3 0.888888888888889 1 0.941176470588235
@@ -0,0 +1,2 @@
"accuracy","macro_precision","macro_recall","macro_f1"
0.944444444444444,0.944444444444444,0.952380952380952,0.94522732169791
1 accuracy macro_precision macro_recall macro_f1
2 0.944444444444444 0.944444444444444 0.952380952380952 0.94522732169791
Binary file not shown.

After

Width:  |  Height:  |  Size: 344 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 227 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

+6
View File
@@ -0,0 +1,6 @@
"Variable","PC1"
"Flavanoids",0.430570697054093
"Total_phenols",0.388556731445086
"OD280_OD315",0.379238757892512
"Proanthocyanins",0.318149910146199
"Nonflavanoid_phenols",-0.292569052362651
1 Variable PC1
2 Flavanoids 0.430570697054093
3 Total_phenols 0.388556731445086
4 OD280_OD315 0.379238757892512
5 Proanthocyanins 0.318149910146199
6 Nonflavanoid_phenols -0.292569052362651
+6
View File
@@ -0,0 +1,6 @@
"Variable","PC2"
"Color_intensity",-0.504116493512561
"Alcohol",-0.480328824227057
"Ash",-0.369020648548877
"Proline",-0.3555672525193
"Hue",0.300324646690879
1 Variable PC2
2 Color_intensity -0.504116493512561
3 Alcohol -0.480328824227057
4 Ash -0.369020648548877
5 Proline -0.3555672525193
6 Hue 0.300324646690879
+178
View File
@@ -0,0 +1,178 @@
1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295
1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045
1,13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85,7.22,1.01,3.55,1045
1,14.1,2.16,2.3,18,105,2.95,3.32,.22,2.38,5.75,1.25,3.17,1510
1,14.12,1.48,2.32,16.8,95,2.2,2.43,.26,1.57,5,1.17,2.82,1280
1,13.75,1.73,2.41,16,89,2.6,2.76,.29,1.81,5.6,1.15,2.9,1320
1,14.75,1.73,2.39,11.4,91,3.1,3.69,.43,2.81,5.4,1.25,2.73,1150
1,14.38,1.87,2.38,12,102,3.3,3.64,.29,2.96,7.5,1.2,3,1547
1,13.63,1.81,2.7,17.2,112,2.85,2.91,.3,1.46,7.3,1.28,2.88,1310
1,14.3,1.92,2.72,20,120,2.8,3.14,.33,1.97,6.2,1.07,2.65,1280
1,13.83,1.57,2.62,20,115,2.95,3.4,.4,1.72,6.6,1.13,2.57,1130
1,14.19,1.59,2.48,16.5,108,3.3,3.93,.32,1.86,8.7,1.23,2.82,1680
1,13.64,3.1,2.56,15.2,116,2.7,3.03,.17,1.66,5.1,.96,3.36,845
1,14.06,1.63,2.28,16,126,3,3.17,.24,2.1,5.65,1.09,3.71,780
1,12.93,3.8,2.65,18.6,102,2.41,2.41,.25,1.98,4.5,1.03,3.52,770
1,13.71,1.86,2.36,16.6,101,2.61,2.88,.27,1.69,3.8,1.11,4,1035
1,12.85,1.6,2.52,17.8,95,2.48,2.37,.26,1.46,3.93,1.09,3.63,1015
1,13.5,1.81,2.61,20,96,2.53,2.61,.28,1.66,3.52,1.12,3.82,845
1,13.05,2.05,3.22,25,124,2.63,2.68,.47,1.92,3.58,1.13,3.2,830
1,13.39,1.77,2.62,16.1,93,2.85,2.94,.34,1.45,4.8,.92,3.22,1195
1,13.3,1.72,2.14,17,94,2.4,2.19,.27,1.35,3.95,1.02,2.77,1285
1,13.87,1.9,2.8,19.4,107,2.95,2.97,.37,1.76,4.5,1.25,3.4,915
1,14.02,1.68,2.21,16,96,2.65,2.33,.26,1.98,4.7,1.04,3.59,1035
1,13.73,1.5,2.7,22.5,101,3,3.25,.29,2.38,5.7,1.19,2.71,1285
1,13.58,1.66,2.36,19.1,106,2.86,3.19,.22,1.95,6.9,1.09,2.88,1515
1,13.68,1.83,2.36,17.2,104,2.42,2.69,.42,1.97,3.84,1.23,2.87,990
1,13.76,1.53,2.7,19.5,132,2.95,2.74,.5,1.35,5.4,1.25,3,1235
1,13.51,1.8,2.65,19,110,2.35,2.53,.29,1.54,4.2,1.1,2.87,1095
1,13.48,1.81,2.41,20.5,100,2.7,2.98,.26,1.86,5.1,1.04,3.47,920
1,13.28,1.64,2.84,15.5,110,2.6,2.68,.34,1.36,4.6,1.09,2.78,880
1,13.05,1.65,2.55,18,98,2.45,2.43,.29,1.44,4.25,1.12,2.51,1105
1,13.07,1.5,2.1,15.5,98,2.4,2.64,.28,1.37,3.7,1.18,2.69,1020
1,14.22,3.99,2.51,13.2,128,3,3.04,.2,2.08,5.1,.89,3.53,760
1,13.56,1.71,2.31,16.2,117,3.15,3.29,.34,2.34,6.13,.95,3.38,795
1,13.41,3.84,2.12,18.8,90,2.45,2.68,.27,1.48,4.28,.91,3,1035
1,13.88,1.89,2.59,15,101,3.25,3.56,.17,1.7,5.43,.88,3.56,1095
1,13.24,3.98,2.29,17.5,103,2.64,2.63,.32,1.66,4.36,.82,3,680
1,13.05,1.77,2.1,17,107,3,3,.28,2.03,5.04,.88,3.35,885
1,14.21,4.04,2.44,18.9,111,2.85,2.65,.3,1.25,5.24,.87,3.33,1080
1,14.38,3.59,2.28,16,102,3.25,3.17,.27,2.19,4.9,1.04,3.44,1065
1,13.9,1.68,2.12,16,101,3.1,3.39,.21,2.14,6.1,.91,3.33,985
1,14.1,2.02,2.4,18.8,103,2.75,2.92,.32,2.38,6.2,1.07,2.75,1060
1,13.94,1.73,2.27,17.4,108,2.88,3.54,.32,2.08,8.90,1.12,3.1,1260
1,13.05,1.73,2.04,12.4,92,2.72,3.27,.17,2.91,7.2,1.12,2.91,1150
1,13.83,1.65,2.6,17.2,94,2.45,2.99,.22,2.29,5.6,1.24,3.37,1265
1,13.82,1.75,2.42,14,111,3.88,3.74,.32,1.87,7.05,1.01,3.26,1190
1,13.77,1.9,2.68,17.1,115,3,2.79,.39,1.68,6.3,1.13,2.93,1375
1,13.74,1.67,2.25,16.4,118,2.6,2.9,.21,1.62,5.85,.92,3.2,1060
1,13.56,1.73,2.46,20.5,116,2.96,2.78,.2,2.45,6.25,.98,3.03,1120
1,14.22,1.7,2.3,16.3,118,3.2,3,.26,2.03,6.38,.94,3.31,970
1,13.29,1.97,2.68,16.8,102,3,3.23,.31,1.66,6,1.07,2.84,1270
1,13.72,1.43,2.5,16.7,108,3.4,3.67,.19,2.04,6.8,.89,2.87,1285
2,12.37,.94,1.36,10.6,88,1.98,.57,.28,.42,1.95,1.05,1.82,520
2,12.33,1.1,2.28,16,101,2.05,1.09,.63,.41,3.27,1.25,1.67,680
2,12.64,1.36,2.02,16.8,100,2.02,1.41,.53,.62,5.75,.98,1.59,450
2,13.67,1.25,1.92,18,94,2.1,1.79,.32,.73,3.8,1.23,2.46,630
2,12.37,1.13,2.16,19,87,3.5,3.1,.19,1.87,4.45,1.22,2.87,420
2,12.17,1.45,2.53,19,104,1.89,1.75,.45,1.03,2.95,1.45,2.23,355
2,12.37,1.21,2.56,18.1,98,2.42,2.65,.37,2.08,4.6,1.19,2.3,678
2,13.11,1.01,1.7,15,78,2.98,3.18,.26,2.28,5.3,1.12,3.18,502
2,12.37,1.17,1.92,19.6,78,2.11,2,.27,1.04,4.68,1.12,3.48,510
2,13.34,.94,2.36,17,110,2.53,1.3,.55,.42,3.17,1.02,1.93,750
2,12.21,1.19,1.75,16.8,151,1.85,1.28,.14,2.5,2.85,1.28,3.07,718
2,12.29,1.61,2.21,20.4,103,1.1,1.02,.37,1.46,3.05,.906,1.82,870
2,13.86,1.51,2.67,25,86,2.95,2.86,.21,1.87,3.38,1.36,3.16,410
2,13.49,1.66,2.24,24,87,1.88,1.84,.27,1.03,3.74,.98,2.78,472
2,12.99,1.67,2.6,30,139,3.3,2.89,.21,1.96,3.35,1.31,3.5,985
2,11.96,1.09,2.3,21,101,3.38,2.14,.13,1.65,3.21,.99,3.13,886
2,11.66,1.88,1.92,16,97,1.61,1.57,.34,1.15,3.8,1.23,2.14,428
2,13.03,.9,1.71,16,86,1.95,2.03,.24,1.46,4.6,1.19,2.48,392
2,11.84,2.89,2.23,18,112,1.72,1.32,.43,.95,2.65,.96,2.52,500
2,12.33,.99,1.95,14.8,136,1.9,1.85,.35,2.76,3.4,1.06,2.31,750
2,12.7,3.87,2.4,23,101,2.83,2.55,.43,1.95,2.57,1.19,3.13,463
2,12,.92,2,19,86,2.42,2.26,.3,1.43,2.5,1.38,3.12,278
2,12.72,1.81,2.2,18.8,86,2.2,2.53,.26,1.77,3.9,1.16,3.14,714
2,12.08,1.13,2.51,24,78,2,1.58,.4,1.4,2.2,1.31,2.72,630
2,13.05,3.86,2.32,22.5,85,1.65,1.59,.61,1.62,4.8,.84,2.01,515
2,11.84,.89,2.58,18,94,2.2,2.21,.22,2.35,3.05,.79,3.08,520
2,12.67,.98,2.24,18,99,2.2,1.94,.3,1.46,2.62,1.23,3.16,450
2,12.16,1.61,2.31,22.8,90,1.78,1.69,.43,1.56,2.45,1.33,2.26,495
2,11.65,1.67,2.62,26,88,1.92,1.61,.4,1.34,2.6,1.36,3.21,562
2,11.64,2.06,2.46,21.6,84,1.95,1.69,.48,1.35,2.8,1,2.75,680
2,12.08,1.33,2.3,23.6,70,2.2,1.59,.42,1.38,1.74,1.07,3.21,625
2,12.08,1.83,2.32,18.5,81,1.6,1.5,.52,1.64,2.4,1.08,2.27,480
2,12,1.51,2.42,22,86,1.45,1.25,.5,1.63,3.6,1.05,2.65,450
2,12.69,1.53,2.26,20.7,80,1.38,1.46,.58,1.62,3.05,.96,2.06,495
2,12.29,2.83,2.22,18,88,2.45,2.25,.25,1.99,2.15,1.15,3.3,290
2,11.62,1.99,2.28,18,98,3.02,2.26,.17,1.35,3.25,1.16,2.96,345
2,12.47,1.52,2.2,19,162,2.5,2.27,.32,3.28,2.6,1.16,2.63,937
2,11.81,2.12,2.74,21.5,134,1.6,.99,.14,1.56,2.5,.95,2.26,625
2,12.29,1.41,1.98,16,85,2.55,2.5,.29,1.77,2.9,1.23,2.74,428
2,12.37,1.07,2.1,18.5,88,3.52,3.75,.24,1.95,4.5,1.04,2.77,660
2,12.29,3.17,2.21,18,88,2.85,2.99,.45,2.81,2.3,1.42,2.83,406
2,12.08,2.08,1.7,17.5,97,2.23,2.17,.26,1.4,3.3,1.27,2.96,710
2,12.6,1.34,1.9,18.5,88,1.45,1.36,.29,1.35,2.45,1.04,2.77,562
2,12.34,2.45,2.46,21,98,2.56,2.11,.34,1.31,2.8,.8,3.38,438
2,11.82,1.72,1.88,19.5,86,2.5,1.64,.37,1.42,2.06,.94,2.44,415
2,12.51,1.73,1.98,20.5,85,2.2,1.92,.32,1.48,2.94,1.04,3.57,672
2,12.42,2.55,2.27,22,90,1.68,1.84,.66,1.42,2.7,.86,3.3,315
2,12.25,1.73,2.12,19,80,1.65,2.03,.37,1.63,3.4,1,3.17,510
2,12.72,1.75,2.28,22.5,84,1.38,1.76,.48,1.63,3.3,.88,2.42,488
2,12.22,1.29,1.94,19,92,2.36,2.04,.39,2.08,2.7,.86,3.02,312
2,11.61,1.35,2.7,20,94,2.74,2.92,.29,2.49,2.65,.96,3.26,680
2,11.46,3.74,1.82,19.5,107,3.18,2.58,.24,3.58,2.9,.75,2.81,562
2,12.52,2.43,2.17,21,88,2.55,2.27,.26,1.22,2,.9,2.78,325
2,11.76,2.68,2.92,20,103,1.75,2.03,.6,1.05,3.8,1.23,2.5,607
2,11.41,.74,2.5,21,88,2.48,2.01,.42,1.44,3.08,1.1,2.31,434
2,12.08,1.39,2.5,22.5,84,2.56,2.29,.43,1.04,2.9,.93,3.19,385
2,11.03,1.51,2.2,21.5,85,2.46,2.17,.52,2.01,1.9,1.71,2.87,407
2,11.82,1.47,1.99,20.8,86,1.98,1.6,.3,1.53,1.95,.95,3.33,495
2,12.42,1.61,2.19,22.5,108,2,2.09,.34,1.61,2.06,1.06,2.96,345
2,12.77,3.43,1.98,16,80,1.63,1.25,.43,.83,3.4,.7,2.12,372
2,12,3.43,2,19,87,2,1.64,.37,1.87,1.28,.93,3.05,564
2,11.45,2.4,2.42,20,96,2.9,2.79,.32,1.83,3.25,.8,3.39,625
2,11.56,2.05,3.23,28.5,119,3.18,5.08,.47,1.87,6,.93,3.69,465
2,12.42,4.43,2.73,26.5,102,2.2,2.13,.43,1.71,2.08,.92,3.12,365
2,13.05,5.8,2.13,21.5,86,2.62,2.65,.3,2.01,2.6,.73,3.1,380
2,11.87,4.31,2.39,21,82,2.86,3.03,.21,2.91,2.8,.75,3.64,380
2,12.07,2.16,2.17,21,85,2.6,2.65,.37,1.35,2.76,.86,3.28,378
2,12.43,1.53,2.29,21.5,86,2.74,3.15,.39,1.77,3.94,.69,2.84,352
2,11.79,2.13,2.78,28.5,92,2.13,2.24,.58,1.76,3,.97,2.44,466
2,12.37,1.63,2.3,24.5,88,2.22,2.45,.4,1.9,2.12,.89,2.78,342
2,12.04,4.3,2.38,22,80,2.1,1.75,.42,1.35,2.6,.79,2.57,580
3,12.86,1.35,2.32,18,122,1.51,1.25,.21,.94,4.1,.76,1.29,630
3,12.88,2.99,2.4,20,104,1.3,1.22,.24,.83,5.4,.74,1.42,530
3,12.81,2.31,2.4,24,98,1.15,1.09,.27,.83,5.7,.66,1.36,560
3,12.7,3.55,2.36,21.5,106,1.7,1.2,.17,.84,5,.78,1.29,600
3,12.51,1.24,2.25,17.5,85,2,.58,.6,1.25,5.45,.75,1.51,650
3,12.6,2.46,2.2,18.5,94,1.62,.66,.63,.94,7.1,.73,1.58,695
3,12.25,4.72,2.54,21,89,1.38,.47,.53,.8,3.85,.75,1.27,720
3,12.53,5.51,2.64,25,96,1.79,.6,.63,1.1,5,.82,1.69,515
3,13.49,3.59,2.19,19.5,88,1.62,.48,.58,.88,5.7,.81,1.82,580
3,12.84,2.96,2.61,24,101,2.32,.6,.53,.81,4.92,.89,2.15,590
3,12.93,2.81,2.7,21,96,1.54,.5,.53,.75,4.6,.77,2.31,600
3,13.36,2.56,2.35,20,89,1.4,.5,.37,.64,5.6,.7,2.47,780
3,13.52,3.17,2.72,23.5,97,1.55,.52,.5,.55,4.35,.89,2.06,520
3,13.62,4.95,2.35,20,92,2,.8,.47,1.02,4.4,.91,2.05,550
3,12.25,3.88,2.2,18.5,112,1.38,.78,.29,1.14,8.21,.65,2,855
3,13.16,3.57,2.15,21,102,1.5,.55,.43,1.3,4,.6,1.68,830
3,13.88,5.04,2.23,20,80,.98,.34,.4,.68,4.9,.58,1.33,415
3,12.87,4.61,2.48,21.5,86,1.7,.65,.47,.86,7.65,.54,1.86,625
3,13.32,3.24,2.38,21.5,92,1.93,.76,.45,1.25,8.42,.55,1.62,650
3,13.08,3.9,2.36,21.5,113,1.41,1.39,.34,1.14,9.40,.57,1.33,550
3,13.5,3.12,2.62,24,123,1.4,1.57,.22,1.25,8.60,.59,1.3,500
3,12.79,2.67,2.48,22,112,1.48,1.36,.24,1.26,10.8,.48,1.47,480
3,13.11,1.9,2.75,25.5,116,2.2,1.28,.26,1.56,7.1,.61,1.33,425
3,13.23,3.3,2.28,18.5,98,1.8,.83,.61,1.87,10.52,.56,1.51,675
3,12.58,1.29,2.1,20,103,1.48,.58,.53,1.4,7.6,.58,1.55,640
3,13.17,5.19,2.32,22,93,1.74,.63,.61,1.55,7.9,.6,1.48,725
3,13.84,4.12,2.38,19.5,89,1.8,.83,.48,1.56,9.01,.57,1.64,480
3,12.45,3.03,2.64,27,97,1.9,.58,.63,1.14,7.5,.67,1.73,880
3,14.34,1.68,2.7,25,98,2.8,1.31,.53,2.7,13,.57,1.96,660
3,13.48,1.67,2.64,22.5,89,2.6,1.1,.52,2.29,11.75,.57,1.78,620
3,12.36,3.83,2.38,21,88,2.3,.92,.5,1.04,7.65,.56,1.58,520
3,13.69,3.26,2.54,20,107,1.83,.56,.5,.8,5.88,.96,1.82,680
3,12.85,3.27,2.58,22,106,1.65,.6,.6,.96,5.58,.87,2.11,570
3,12.96,3.45,2.35,18.5,106,1.39,.7,.4,.94,5.28,.68,1.75,675
3,13.78,2.76,2.3,22,90,1.35,.68,.41,1.03,9.58,.7,1.68,615
3,13.73,4.36,2.26,22.5,88,1.28,.47,.52,1.15,6.62,.78,1.75,520
3,13.45,3.7,2.6,23,111,1.7,.92,.43,1.46,10.68,.85,1.56,695
3,12.82,3.37,2.3,19.5,88,1.48,.66,.4,.97,10.26,.72,1.75,685
3,13.58,2.58,2.69,24.5,105,1.55,.84,.39,1.54,8.66,.74,1.8,750
3,13.4,4.6,2.86,25,112,1.98,.96,.27,1.11,8.5,.67,1.92,630
3,12.2,3.03,2.32,19,96,1.25,.49,.4,.73,5.5,.66,1.83,510
3,12.77,2.39,2.28,19.5,86,1.39,.51,.48,.64,9.899999,.57,1.63,470
3,14.16,2.51,2.48,20,91,1.68,.7,.44,1.24,9.7,.62,1.71,660
3,13.71,5.65,2.45,20.5,95,1.68,.61,.52,1.06,7.7,.64,1.74,740
3,13.4,3.91,2.48,23,102,1.8,.75,.43,1.41,7.3,.7,1.56,750
3,13.27,4.28,2.26,20,120,1.59,.69,.43,1.35,10.2,.59,1.56,835
3,13.17,2.59,2.37,20,120,1.65,.68,.53,1.46,9.3,.6,1.62,840
3,14.13,4.1,2.74,24.5,96,2.05,.76,.56,1.35,9.2,.61,1.6,560
+100
View File
@@ -0,0 +1,100 @@
1. Title of Database: Wine recognition data
Updated Sept 21, 1998 by C.Blake : Added attribute information
2. Sources:
(a) Forina, M. et al, PARVUS - An Extendible Package for Data
Exploration, Classification and Correlation. Institute of Pharmaceutical
and Food Analysis and Technologies, Via Brigata Salerno,
16147 Genoa, Italy.
(b) Stefan Aeberhard, email: stefan@coral.cs.jcu.edu.au
(c) July 1991
3. Past Usage:
(1)
S. Aeberhard, D. Coomans and O. de Vel,
Comparison of Classifiers in High Dimensional Settings,
Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of
Mathematics and Statistics, James Cook University of North Queensland.
(Also submitted to Technometrics).
The data was used with many others for comparing various
classifiers. The classes are separable, though only RDA
has achieved 100% correct classification.
(RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data))
(All results using the leave-one-out technique)
In a classification context, this is a well posed problem
with "well behaved" class structures. A good data set
for first testing of a new classifier, but not very
challenging.
(2)
S. Aeberhard, D. Coomans and O. de Vel,
"THE CLASSIFICATION PERFORMANCE OF RDA"
Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of
Mathematics and Statistics, James Cook University of North Queensland.
(Also submitted to Journal of Chemometrics).
Here, the data was used to illustrate the superior performance of
the use of a new appreciation function with RDA.
4. Relevant Information:
-- These data are the results of a chemical analysis of
wines grown in the same region in Italy but derived from three
different cultivars.
The analysis determined the quantities of 13 constituents
found in each of the three types of wines.
-- I think that the initial data set had around 30 variables, but
for some reason I only have the 13 dimensional version.
I had a list of what the 30 or so variables were, but a.)
I lost it, and b.), I would not know which 13 variables
are included in the set.
-- The attributes are (donated by Riccardo Leardi,
riclea@anchem.unige.it )
1) Alcohol
2) Malic acid
3) Ash
4) Alcalinity of ash
5) Magnesium
6) Total phenols
7) Flavanoids
8) Nonflavanoid phenols
9) Proanthocyanins
10)Color intensity
11)Hue
12)OD280/OD315 of diluted wines
13)Proline
5. Number of Instances
class 1 59
class 2 71
class 3 48
6. Number of Attributes
13
7. For Each Attribute:
All attributes are continuous
No statistics available, but suggest to standardise
variables for certain uses (e.g. for use with classifiers
which are NOT scale invariant)
NOTE: 1st attribute is class identifier (1-3)
8. Missing Attribute Values:
None
9. Class Distribution: number of instances per class
class 1 59
class 2 71
class 3 48