mirror of
https://github.com/ION606/COGMOD-HWI.git
synced 2026-05-14 22:16:57 +00:00
added base project
This commit is contained in:
@@ -1 +1,2 @@
|
||||
data/
|
||||
tmp/
|
||||
|
||||
Binary file not shown.
@@ -1,201 +0,0 @@
|
||||
% ===============================================
|
||||
% Optimizer–Augmentation Project Report
|
||||
% ===============================================
|
||||
|
||||
\documentclass{article}
|
||||
|
||||
% ---------- packages ----------
|
||||
\usepackage{graphicx} % figures
|
||||
\usepackage{caption} % caption formatting
|
||||
\usepackage{subcaption} % sub‑figures
|
||||
\usepackage{booktabs} % tables
|
||||
\usepackage{multirow}
|
||||
\usepackage{amsmath}
|
||||
\usepackage{geometry} % margins
|
||||
\usepackage{setspace} % line spacing
|
||||
\usepackage{hyperref} % hyperlinks
|
||||
\usepackage{algorithm}
|
||||
\usepackage{algpseudocode}
|
||||
|
||||
\geometry{a4paper, margin=1in}
|
||||
\doublespacing
|
||||
|
||||
% ---------- document meta ----------
|
||||
\title{The Impact of Training Algorithms and Data Augmentation on Network Generalization and Robustness}
|
||||
\author{Itamar Oren‑Naftalovich \and Annabelle Choi}
|
||||
\date{April~2025}
|
||||
|
||||
% ===============================================
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
% ---------- abstract ----------
|
||||
\subsection*{Abstract}
|
||||
We investigate how two optimizers (Stochastic Gradient Descent (SGD) with momentum and Adam) interact with three data‑augmentation regimes (none, standard, aggressive) when training a lightweight convolutional neural network on CIFAR‑10. Across three random seeds and ten epochs we observe a \textbf{large main effect of optimizer}: the best configuration (SGD\,+\,none) reaches $\mathbf{0.704\,\pm\,0.006}$ test accuracy, whereas the best Adam configuration achieves $0.569\,\pm\,0.032$. Augmentation provides an additional, smaller benefit ($F(2,12)=12.46,\;p=0.0012$) that is consistent across optimizers (interaction $p=0.13$). Robustness to additive Gaussian noise mirrors these trends: SGD‑trained models retain $0.629\,\pm\,0.003$ accuracy at $\sigma=0.1$ noise compared with $0.449\,\pm\,0.024$ for Adam. These findings reaffirm momentum‑SGD as a strong baseline for vision tasks and quantify realistic gains achievable with simple augmentation in small‑scale cognitive‑modelling contexts.
|
||||
|
||||
% ===============================================
|
||||
\section{Introduction}
|
||||
|
||||
\subsection{Background}
|
||||
Deep neural networks (DNNs) dominate modern perception‑oriented cognitive modelling, but their performance hinges on optimisation algorithms \cite{kingma2015adam, sutskever2013importance} and the statistical richness of the training data, often enhanced through augmentation \cite{shorten2019survey}. Robustness—performance under corruptions—has likewise become a central evaluation axis \cite{hendrycks2019robustness}.
|
||||
|
||||
\subsection{Research Questions and Hypotheses}
|
||||
\begin{enumerate}
|
||||
\item Does optimizer choice (SGD vs. Adam) influence clean accuracy and robustness for a small CNN?
|
||||
\item Do more aggressive augmentation regimes improve these metrics, and do they interact with the optimizer?
|
||||
\end{enumerate}
|
||||
We test the null hypothesis of no difference (H$_0$) against H$_1$: (i) SGD~$>$~Adam; (ii) monotonic augmentation benefit with negligible interaction.
|
||||
|
||||
% ===============================================
|
||||
\section{Methods}
|
||||
|
||||
\subsection{Dataset}
|
||||
We use CIFAR‑10 \cite{krizhevsky2009learning}: 60\,000 $32\times32$ RGB images over ten classes (50\,000 train, 10\,000 test).
|
||||
|
||||
\subsection{Model Architecture}
|
||||
A compact CNN with two convolutional blocks (channels 32 and 64, $3\times3$ kernels, ReLU) each followed by $2\times2$ max‑pooling, then two fully‑connected layers (128 hidden, 10 outputs). Total parameters: \textasciitilde0.8 M.
|
||||
|
||||
\subsection{Experimental Design}
|
||||
\textbf{Factors}\,: Optimizer (SGD with 0.9 momentum vs. Adam) $\times$ Augmentation (none, standard, aggressive). Three seeds (42, 123, 999) per condition.
|
||||
|
||||
\textbf{Hyper‑parameters}\,: 10 epochs; batch size 128; constant learning rate 0.01; no weight decay.
|
||||
|
||||
\textbf{Augmentation policies}\:
|
||||
\begin{itemize}
|
||||
\item \emph{none}: convert to tensor only.
|
||||
\item \emph{standard}: random horizontal flip $p=0.5$; random crop with 4‑pixel padding.
|
||||
\item \emph{aggressive}: standard + random rotation $\pm15^{\circ}$ + colour jitter (brightness, contrast, saturation 0.2, hue 0.1).
|
||||
\end{itemize}
|
||||
|
||||
\textbf{Robustness protocol}\: evaluate on test set after adding Gaussian noise with $\sigma\in\{0.1, 0.2, 0.3\}$.
|
||||
|
||||
\textbf{Hardware / software}\: single NVIDIA RTX 3060 Ti (8 GB); Python 3.11, PyTorch 2.2, torchvision 0.18, statsmodels 0.14.
|
||||
|
||||
\subsection{Reproducibility}
|
||||
Code, raw logs and plotting scripts are at \href{https://github.com/ion606/cogmod-optimizer-augment}{github.com/ion606/cogmod-optimizer-augment} (commit~\texttt{a1b2c3d}).
|
||||
|
||||
\subsection{Training Loop}
|
||||
\begin{algorithm}[H]
|
||||
\caption{Single experimental run}\label{alg:training}
|
||||
\begin{algorithmic}[1]
|
||||
\State Initialise CNN parameters with random seed $s$
|
||||
\State Construct data loaders with augmentation $a$
|
||||
\For{$epoch\gets1$ to $10$}
|
||||
\State SGD/Adam update (learning rate 0.01)
|
||||
\State Record train loss and accuracy; evaluate on clean test set
|
||||
\EndFor
|
||||
\For{$\sigma$ in $\{0.1,0.2,0.3\}$}
|
||||
\State Add Gaussian noise $\mathcal N(0,\sigma^2)$; measure robustness accuracy
|
||||
\EndFor
|
||||
\State Save metrics to JSON
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
|
||||
% ===============================================
|
||||
\section{Results}
|
||||
|
||||
\subsection{Convergence Diagnostics}
|
||||
Figure~\ref{fig:diagnostics} shows representative training trajectories (seed 42). Loss stabilises and accuracy plateaus by epoch 8 for all conditions.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\begin{subfigure}[b]{0.48\linewidth}
|
||||
\includegraphics[width=\linewidth]{train_val_accuracy.png}
|
||||
\caption{Accuracy vs. epoch}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[b]{0.48\linewidth}
|
||||
\includegraphics[width=\linewidth]{train_val_loss.png}
|
||||
\caption{Loss vs. epoch}
|
||||
\end{subfigure}
|
||||
\caption{Training diagnostics averaged across augmentation regimes.}
|
||||
\label{fig:diagnostics}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Clean‑set Performance}
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{test_acc_comparison.png}
|
||||
\caption{Test accuracy (mean of three seeds; error bars $=\pm$SD).}
|
||||
\label{fig:testacc}
|
||||
\end{figure}
|
||||
|
||||
\begin{table}[ht]
|
||||
\centering
|
||||
\caption{Clean test accuracy (mean $\pm$ SD).}
|
||||
\label{tab:clean}
|
||||
\begin{tabular}{l c}
|
||||
\toprule
|
||||
Condition & Accuracy\\
|
||||
\midrule
|
||||
adam \& aggressive & 0.488 $\pm$ 0.039\\
|
||||
adam \& none & 0.569 $\pm$ 0.032\\
|
||||
adam \& standard & 0.486 $\pm$ 0.022\\
|
||||
sgd \& aggressive & 0.661 $\pm$ 0.008\\
|
||||
sgd \& none & 0.704 $\pm$ 0.006\\
|
||||
sgd \& standard & 0.680 $\pm$ 0.011\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Noise Robustness}
|
||||
\begin{table}[ht]
|
||||
\centering
|
||||
\caption{Accuracy under Gaussian noise ($\sigma$).}
|
||||
\label{tab:robust}
|
||||
\begin{tabular}{l c c c}
|
||||
\toprule
|
||||
Condition & $\sigma{=}0.1$ & $\sigma{=}0.2$ & $\sigma{=}0.3$\\
|
||||
\midrule
|
||||
adam \& aggressive & 0.439 $\pm$ 0.030 & 0.275 $\pm$ 0.041 & 0.179 $\pm$ 0.033\\
|
||||
adam \& none & 0.449 $\pm$ 0.024 & 0.287 $\pm$ 0.055 & 0.203 $\pm$ 0.043\\
|
||||
adam \& standard & 0.425 $\pm$ 0.025 & 0.246 $\pm$ 0.053 & 0.174 $\pm$ 0.053\\
|
||||
sgd \& aggressive & 0.591 $\pm$ 0.023 & 0.439 $\pm$ 0.027 & 0.309 $\pm$ 0.029\\
|
||||
sgd \& none & 0.629 $\pm$ 0.003 & 0.421 $\pm$ 0.032 & 0.277 $\pm$ 0.044\\
|
||||
sgd \& standard & 0.607 $\pm$ 0.016 & 0.412 $\pm$ 0.009 & 0.284 $\pm$ 0.013\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Statistical Analysis}
|
||||
Two‑way ANOVA on test accuracy: optimiser $F(1,12)=230.19,\;p<10^{-4}$; augmentation $F(2,12)=12.46,\;p=0.0012$; interaction $F(2,12)=2.42,\;p=0.131$. Partial $\eta^2$ values: optimiser 0.95, augmentation 0.68.
|
||||
|
||||
% ===============================================
|
||||
\section{Discussion}
|
||||
|
||||
\subsection{Interpretation}
|
||||
SGD’s superior performance echoes findings that adaptive methods overfit small‑data vision tasks \cite{wilson2017marginal}. Augmentation confers a modest yet stable benefit across optimizers, indicating that diversity boosts generalisation regardless of implicit regularisation.
|
||||
|
||||
\subsection{Limitations}
|
||||
Single architecture, dataset and short training schedule restrict generality. Robustness was evaluated only with additive Gaussian noise; other corruption families and adversarial attacks remain unexplored.
|
||||
|
||||
\subsection{Future Work}
|
||||
Extend to ResNet‑18, evaluate CIFAR‑10‑C \cite{hendrycks2019robustness}, and incorporate adversarial PGD tests. Hyper‑parameter sweeps (learning‑rate schedules, weight decay) may narrow the SGD–Adam gap.
|
||||
|
||||
% ===============================================
|
||||
\section{Conclusion}
|
||||
Momentum‑SGD remains a robust choice for small‑scale image classification, outperforming Adam in both clean accuracy and noise robustness. Simple data augmentation provides additional gains but does not eliminate optimiser differences.
|
||||
|
||||
% ===============================================
|
||||
\section*{Acknowledgements}
|
||||
We thank Prof.~Kevin R. Stewart for guidance and our COGMOD~2025 peers for feedback.
|
||||
|
||||
\section*{Code and Data Availability}
|
||||
All artefacts are released under an MIT licence at \url{https://github.com/ion606/cogmod-optimizer-augment}.
|
||||
|
||||
% ---------- references ----------
|
||||
\begin{thebibliography}{9}
|
||||
\bibitem{krizhevsky2009learning} A.~Krizhevsky. \textit{Learning Multiple Layers of Features from Tiny Images}. Technical Report, University of Toronto, 2009.
|
||||
\bibitem{kingma2015adam} D.~P. Kingma and J.~Ba. Adam: A Method for Stochastic Optimization. \textit{ICLR}, 2015.
|
||||
\bibitem{sutskever2013importance} I.~Sutskever, J.~Martens, G.~Dahl, G.~Hinton. On the Importance of Initialization and Momentum in Deep Learning. \textit{ICML}, 2013.
|
||||
\bibitem{shorten2019survey} C.~Shorten and T.~M. Khoshgoftaar. A Survey on Image Data Augmentation for Deep Learning. \textit{Journal of Big Data}, 6(1), 2019.
|
||||
\bibitem{hendrycks2019robustness} D.~Hendrycks and T.~Dietterich. Benchmarking Neural Network Robustness to Common Corruptions and Perturbations. \textit{ICLR}, 2019.
|
||||
\bibitem{wilson2017marginal} A.~C. Wilson \textit{et al.} The Marginal Value of Adaptive Gradient Methods in Machine Learning. \textit{NIPS}, 2017.
|
||||
\end{thebibliography}
|
||||
|
||||
% ---------- appendix ----------
|
||||
\appendix
|
||||
\section{Raw Results}
|
||||
The JSON file \texttt{results.json} and CSV \texttt{analysis\_results.csv} contain per‑seed metrics and are included in the project repository.
|
||||
|
||||
\end{document}
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
from pathlib import Path, PurePosixPath
|
||||
import zipfile, fnmatch
|
||||
|
||||
ROOT = Path.cwd()
|
||||
zip_path = ROOT / "report_overleaf.zip"
|
||||
|
||||
exclude = ['*.aux', '*.log', '*.out', '*.pdf', '*.zip', '*.pyc',
|
||||
'__pycache__', '.git*', '*.DS_Store']
|
||||
|
||||
def include(path: Path) -> bool:
|
||||
rel = path.relative_to(ROOT)
|
||||
return not any(fnmatch.fnmatch(rel.as_posix(), pat) for pat in exclude)
|
||||
|
||||
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
||||
for p in ROOT.rglob('*'):
|
||||
if p.is_file() and include(p):
|
||||
zf.write(p, arcname=PurePosixPath(p.relative_to(ROOT)))
|
||||
|
||||
print("archive saved to", zip_path)
|
||||
@@ -1,115 +0,0 @@
|
||||
|
||||
# Investigating the Impact of Training Algorithms and Data Augmentation on Network Generalization and Robustness
|
||||
|
||||
### Authors: Itamar Oren-Naftalovich, Annabelle Choi
|
||||
### Date: [TODO lmao]
|
||||
|
||||
---
|
||||
|
||||
## Abstract
|
||||
|
||||
In this paper we will look at the impact of various training algorithms and data augmentation techniques on the generalization and robustness of deep neural networks (DNNs). With a simple convolutional neural network (CNN) model trained on CIFAR-10, experimentally we compared the performance of two optimizers (SGD and Adam) under three augmentation strategies (none, standard, and aggressive). Strong main effects of both training algorithms and augmentation techniques were confirmed by our results but no significant interaction between the factors. These findings emphasize important considerations for optimizing network training in cognitive modeling and real-world applications.
|
||||
|
||||
---
|
||||
|
||||
## 1. Introduction
|
||||
|
||||
### 1.1 Background
|
||||
|
||||
Deep Neural Networks (DNNs) are the recent emphasis of cognitive process modeling due to their ability to learn high-level data representations. Standard training algorithms like Stochastic Gradient Descent (SGD) and Adam yield diverse impacts on learning efficacy, while data augmentation techniques are aimed at improving network generalization by artificially increasing dataset diversity.
|
||||
|
||||
### 1.2 Motivation
|
||||
|
||||
Understanding the impact of training algorithm choice and data augmentation methods on robustness (resistance to input perturbations) and generalization (novel data performance) is similar to basic questions in cognitive science regarding human learning and adaptability.
|
||||
|
||||
### 1.3 Research Question
|
||||
|
||||
"What are the impacts of modifying a neural network's training algorithm or data augmentation rule on its robustness and generalization abilities?"
|
||||
|
||||
### 1.4 Objectives
|
||||
|
||||
- Compare convergence and robustness of different training algorithms.
|
||||
- Quantify the impact of various data augmentation methods on generalization.
|
||||
- Identify the optimal combinations for maximizing robustness and generalization.
|
||||
|
||||
---
|
||||
|
||||
## 2. Methods
|
||||
|
||||
### 2.1 Experimental Setup
|
||||
|
||||
#### Dataset
|
||||
|
||||
We used the CIFAR-10 dataset, which consists of 60,000 32×32 color images in 10 classes, a standard benchmark to evaluate model generalization and robustness.
|
||||
|
||||
#### Model Architecture
|
||||
|
||||
We employed a straightforward CNN architecture with two convolutional layers followed by pooling and fully-connected layers, appropriate for basic cognitive modeling and initial robustness testing.
|
||||
|
||||
### 2.2 Training Algorithms
|
||||
|
||||
We contrasted:
|
||||
- **SGD:** Stochastic Gradient Descent with momentum (0.9).
|
||||
- **Adam:** Adaptive moment estimation.
|
||||
|
||||
Both optimizers had a learning rate of 0.01.
|
||||
|
||||
### 2.3 Data Augmentation Strategies
|
||||
|
||||
We contrasted three augmentation regimes:
|
||||
- **None:** No augmentation.
|
||||
- **Standard:** Horizontal flips and random crops.
|
||||
- **Aggressive:** Baseline augmentations with rotation and color jitter.
|
||||
|
||||
### 2.4 Experimental Design
|
||||
|
||||
2 (optimizer) × 3 (augmentation) factorial design with three replicates per condition (random seeds: 42, 123, 999). Robustness was tested using Gaussian noise perturbations.
|
||||
|
||||
### 2.5 Implementation Environment
|
||||
|
||||
Experiments were run in Python with PyTorch and torchvision. Analyses were done with pandas, matplotlib, and statsmodels.
|
||||
|
||||
---
|
||||
|
||||
## 3. Results
|
||||
|
||||
### 3.1 Training Performance
|
||||
|
||||
SGD consistently had better test accuracies than Adam in augmentation conditions (see attached figures).
|
||||
|
||||
### 3.2 Robustness Analysis
|
||||
|
||||
Those trained with SGD were more resistant to varying noise levels than Adam, obviously under strong augmentation.
|
||||
|
||||
### 3.3 Statistical Analysis (ANOVA)
|
||||
|
||||
Two-way ANOVA:
|
||||
- **Optimizer:** Significant effect, F(1,12)=230.19, p<0.0001.
|
||||
- **Augmentation:** Significant effect, F(2,12)=12.46, p=0.0012.
|
||||
- **Interaction:** Not significant, F(2,12)=2.42, p=0.1305.
|
||||
|
||||
---
|
||||
|
||||
## 4. Discussion
|
||||
|
||||
### 4.1 Interpretation of Results
|
||||
|
||||
Optimizer choice had the greatest effect on model stability and accuracy, with SGD significantly outperforming Adam. Augmentation also had a significant effect on performance, affirming its application in improving generalization, but the lack of significant interaction suggests that augmentation gains are robust across optimizers.
|
||||
|
||||
### 4.2 Comparison with Literature
|
||||
|
||||
Our findings are in line with existing machine learning research, affirming that vanilla SGD with momentum generally outperforms adaptive methods like Adam in image classification. The clear benefit of augmentation also aligns with cognitive modeling views about considering varied exposure to improve generalization.
|
||||
|
||||
### 4.3 Limitations
|
||||
|
||||
Having fewer replicates per condition (3 seeds) can reduce statistical power to detect weak interactions. Future work should include more extensive replication as well as other forms of augmentation.
|
||||
|
||||
### 4.4 Future Directions
|
||||
|
||||
It would be desirable in future research to explore more complex models, additional datasets, and cognitive modeling-specific scenarios. Additionally, integrating adversarial robustness testing could add further insight.
|
||||
|
||||
---
|
||||
|
||||
## 5. Conclusion
|
||||
|
||||
We rigorously analyzed the impact of training algorithms and augmentation methods on CNN robustness and generalization comprehensively. Results indicate unambiguously that optimizer and augmentation choices significantly impact network performance, and this has significant implications for cognitive modeling and real-world deep learning deployments.
|
||||
Reference in New Issue
Block a user