# Lab 1: exploratory look at the RLI and PHL variables of the 2024 EPI results.
# Sections: load data, inspect/filter variables, histograms, ECDFs, Q-Q plots,
# and a few two-sample statistical tests on simulated normal data.

# install.packages(c("readr", "EnvStats"))
library(readr)
library(EnvStats)

### Load Data ###

# The CSV lives in the "Lab 1" folder below the directory the script is run
# from. Build the path with file.path() instead of calling setwd(): setwd()
# changes global state, and running the script a second time in the same
# session would try to enter "Lab 1/Lab 1".
epi.data <- read_csv(file.path("Lab 1", "epi_results_2024_pop_gdp.csv"))

# view dataframe (View() needs an interactive session; skip it under Rscript)
if (interactive()) {
  View(epi.data)
}

# print summary of variables in dataframe
# NOTE(review): the previous `epi.data$epi_results_2024_pop_gdp.csv.new` looks
# like a find/replace slip rather than a column name; summarising the whole
# data frame matches the comment. Confirm against the CSV header.
summary(epi.data)

# print values in variable
epi.data$RLI.new

######## Optional ########
## Reference a column without the `epi.data$` prefix. with() evaluates the
## expression inside the data frame without touching the search path, so it
## cannot mask or be masked by other objects the way attach() can.
with(epi.data, RLI.new)

########################

### Explore Variable ###

RLI <- epi.data$RLI.new

# find NAs in variable - logical vector, TRUE if NA, FALSE otherwise
NAs <- is.na(RLI)

# print the NA entries
RLI[which(NAs)]

# print values in variable
PHL <- epi.data$PHL.new
PHL

# find NAs in variable - logical vector, TRUE if NA, FALSE otherwise
NAs <- is.na(PHL)

# print the NA entries
PHL[which(NAs)]

# take subset of NOT NAs from variable
PHL.noNA <- PHL[!NAs]
PHL.noNA

# filter for only values above 30
PHL.above30 <- PHL.noNA[PHL.noNA > 30]
PHL.above30

# stats
summary(PHL.above30)

# boxplot of variable(s); note the PHL box only contains values above 30
boxplot(RLI, PHL.above30, names = c("RLI", "PHL"))

### Histograms ###

# Define the break points from the data so they always span the full range of
# RLI: round the observed min/max outwards to multiples of 5, then step by 1.
rng <- range(RLI, na.rm = TRUE)
lo <- floor(rng[1] / 5) * 5
hi <- ceiling(rng[2] / 5) * 5
brks <- seq(lo, hi, by = 1)

# One-line summary of the bins. (paste() recycles over vectors, so pasting
# `rng` and `brks` directly prints one string per element.)
cat(sprintf(
  "RLI range [%g, %g]; %d breaks from %g to %g\n",
  rng[1], rng[2], length(brks), lo, hi
))

# histogram (density scale) over the computed breaks
# Why the earlier call failed: in hist(RLI, x, breaks = brks, prob = TRUE) the
# extra positional `x` is matched to `freq`, the next formal after `breaks`.
# `freq` then has length 15 and plot.histogram() stops at `freq && !equidist`.
# Pass the breaks by name only and do not add a second positional argument.
hist(RLI, breaks = brks, probability = TRUE)

# add estimated density curve for variable (or try bw = "SJ")
lines(density(RLI, na.rm = TRUE))

# add rug
rug(RLI)

# histogram (density scale) with Freedman-Diaconis bin widths
hist(RLI, breaks = "FD", probability = TRUE)

# add estimated density curve for variable
lines(density(RLI, na.rm = TRUE, bw = "SJ"))

# add rug
rug(RLI)

# same histogram again, this time overlaid with normal density curves
hist(RLI, breaks = "FD", probability = TRUE)

# range over which to evaluate the normal densities
x1 <- seq(5, 95, 1)

# probability density values for a normal distribution with given mean and sd
d1 <- dnorm(x1, mean = 45, sd = 11, log = FALSE)
lines(x1, d1)

# probability density values for a normal distribution with given mean and sd
d2 <- dnorm(x1, mean = 64, sd = 11, log = FALSE)
lines(x1, d2)

# second curve again at half height
lines(x1, 0.5 * d2)

### Empirical Cumulative Distribution Function ###

# plot ecdfs
plot(ecdf(RLI), do.points = FALSE, verticals = TRUE)
plot(ecdf(PHL), do.points = FALSE, verticals = TRUE)

### Quantile-quantile Plots ###

# Q-Q plot for variable against the theoretical normal distribution
qqnorm(RLI)
qqline(RLI)

# Q-Q plot for random normal numbers against the theoretical normal distribution
x <- rnorm(500)
qqnorm(x)
qqline(x)

# Q-Q plot for variable against a simulated normal sample of the same size.
# RLI.sub was used here without ever being defined; it is taken to be the
# non-missing RLI values.
RLI.sub <- RLI[!is.na(RLI)]
qqplot(rnorm(length(RLI.sub)), RLI.sub, xlab = "Q-Q plot for norm dsn")
qqline(RLI.sub)

# Q-Q plot for 2 variables
qqplot(RLI, PHL, xlab = "Q-Q plot for RLI vs PHL")

# Q-Q plot for the simulated normal sample against the variable
qqplot(x, RLI, xlab = "Q-Q plot for norm sample vs RLI")
qqline(RLI)

# Q-Q plot for two independent simulated normal samples
y <- rnorm(500)
qqplot(x, y, xlab = "Q-Q plot for two norm samples")
qqline(y)

## Statistical Tests

x <- rnorm(500)
y <- rnorm(500)

hist(x)
hist(y)

# normality: Shapiro-Wilk
shapiro.test(x)
shapiro.test(y)

# normality: Anderson-Darling. ad.test() is not provided by the packages
# loaded above, so use the Anderson-Darling option of EnvStats::gofTest().
gofTest(x, test = "ad", distribution = "norm")
gofTest(y, test = "ad", distribution = "norm")

# two-sample comparisons: distribution, location, variance, mean
ks.test(x, y)
wilcox.test(x, y)
var.test(x, y)
t.test(x, y)