library(readr)
library(EnvStats)
library(nortest)

# install.packages(c("readr", "EnvStats", "nortest"))

# Set working directory. NOTE: "~" expands to the user's home directory, so
# this path is anchored to one machine's folder layout rather than being
# relative to the script.
setwd("~/Desktop/Data Analytics/Lab 1")

# read data
epi.data <- read_csv("epi_results_2024_pop_gdp.csv")

# Open the PDF device only after the data has loaded, so a failed read does
# not leave an empty PDF and a dangling graphics device behind. The device
# is closed by dev.off() at the end of the script.
pdf("all_plots.pdf", width = 8, height = 6)

# view dataframe (the spreadsheet viewer is only useful interactively)
if (interactive()) {
  View(epi.data)
}

# print summary of every variable in the dataframe
summary(epi.data)

# print values in variable
epi.data$RLI.new

# attach dataframe so its columns can be referenced by bare name
attach(epi.data)

# print values in variable (resolved through the attached dataframe)
RLI.new

########################

### Explore Variable ###

RLI <- epi.data$RLI.new

# find NAs in variable - outputs vector of logical values, TRUE if NA, FALSE otherwise
NAs <- is.na(RLI)

# print NAs
RLI[NAs]

# print values in variable
PHL <- epi.data$PHL.new

PHL

# copies of both variables with the NAs removed
RLI_noNA <- RLI[!NAs]
PHL_noNA <- PHL[!is.na(PHL)]

# reproducible random subsample of at most 180 non-missing RLI values
set.seed(1)
# Index with sample.int() rather than calling sample() on the data directly:
# sample(x, ...) silently treats a single number x as the range 1:x.
RLI_sub <- RLI_noNA[
  sample.int(length(RLI_noNA), size = min(180, length(RLI_noNA)))
]
RLI_new_sub <- RLI_sub  # second alias for the same subsample


# find NAs in variable - outputs vector of logical values, TRUE if NA, FALSE otherwise
NAs <- is.na(PHL)

# print NAs
PHL[NAs]

# take subset of NOT NAs from variable
PHL.noNA <- PHL[!NAs]

PHL.noNA

# filter for only values above 30
PHL.above30 <- PHL.noNA[PHL.noNA > 30]

PHL.above30

# stats
summary(PHL.above30)

# boxplot of variable(s); the second box shows only the PHL values above 30
boxplot(RLI, PHL.above30, names = c("RLI", "PHL"))


### Histograms ###

# histogram (frequency distribution)
# hist(RLI)

# define the bin edges over which to plot the histogram
x <- seq(0, 100, 10)

# histogram on the density scale using the bin edges in x
# NOTE(review): hist() stops if any value falls outside the edges, so this
# assumes RLI lies within 0-100 - confirm against the data.
hist(RLI, breaks = x, prob = TRUE)

# overlay estimated density curve for variable (or try bw = "SJ")
lines(density(RLI, na.rm = TRUE))

# print rug
rug(RLI)

# histogram on the density scale with bin widths chosen by the
# Freedman-Diaconis rule
hist(RLI, breaks = "FD", prob = TRUE)

# overlay estimated density curve using the "SJ" bandwidth selector
lines(density(RLI, na.rm = TRUE, bw = "SJ"))

# print rug
rug(RLI)


# same histogram again, as a backdrop for the normal curves below
hist(RLI, breaks = "FD", prob = TRUE)

# range of values at which to evaluate the normal densities
x1 <- seq(5, 95, 1)

# probability density values for a normal distribution with given mean and sd
d1 <- dnorm(x1, mean = 45, sd = 11, log = FALSE)

# overlay density curve
lines(x1, d1)

# probability density values for a normal distribution with given mean and sd
d2 <- dnorm(x1, mean = 64, sd = 11, log = FALSE)

# overlay density curve
lines(x1, d2)

# overlay the second curve again, scaled to half height
lines(x1, 0.5 * d2)

### Empirical Cumulative Distribution Function ###

# ECDF of each variable as a step function: vertical risers are drawn and
# the point markers are left out
plot(ecdf(RLI), verticals = TRUE, do.points = FALSE)

plot(ecdf(PHL), verticals = TRUE, do.points = FALSE)


### Quantile-quantile Plots ###

# Q-Q plot of the variable against the theoretical normal distribution
qqnorm(RLI); qqline(RLI)


# Q-Q plot of random draws from a normal distribution against the
# theoretical normal distribution
x <- rnorm(500)
qqnorm(x); qqline(x)


# Q-Q plot of the RLI subsample against a fresh normal sample
qqplot(rnorm(180), RLI_sub, xlab = "Q-Q plot for norm dsn")
qqline(RLI_sub)

# Q-Q plot for 2 variables
qqplot(RLI, PHL, xlab = "Q-Q plot for RLI vs PHL")

# Q-Q plot of the variable against the normal sample x
qqplot(x, RLI, xlab = "Q-Q plot for norm sample vs RLI")
qqline(RLI)

y <- rnorm(500)

# Q-Q plot of two independent normal samples
qqplot(x, y, xlab = "Q-Q plot for two norm samples")
qqline(y)


## Statistical Tests

# two fresh samples of 500 draws each from the standard normal distribution
x <- rnorm(n = 500)
y <- rnorm(n = 500)

# frequency histograms of both samples
hist(x)
hist(y)

# normality checks on each sample: Shapiro-Wilk ...
shapiro.test(x)
shapiro.test(y)

# ... and Anderson-Darling
ad.test(x)
ad.test(y)

# two-sample Kolmogorov-Smirnov test
ks.test(x, y)

# Wilcoxon rank-sum test
wilcox.test(x, y)

# F test comparing the two variances, then two-sample t test on the means
var.test(x, y)
t.test(x, y)

# close the graphics device opened for the plots
dev.off()