This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Data-Analytics/Lab 1/Lab1_code_sample.R
T
2025-09-14 11:25:41 -04:00

194 lines
3.8 KiB
R
Executable File

library(readr)
library(EnvStats)
# install.packages(c("readr", "EnvStats"))
# Move into the "Lab 1" folder (relative to the current working directory).
# Guarded so that running the script twice in one session does not try to
# descend into "Lab 1/Lab 1".
if (basename(getwd()) != "Lab 1") {
  setwd(file.path(getwd(), "Lab 1"))
}
# read data
epi.data <- read_csv("epi_results_2024_pop_gdp.csv")
# open the spreadsheet-style viewer only in interactive sessions; it serves
# no purpose when the script is run in batch mode
if (interactive()) {
  View(epi.data)
}
# print summary of the RLI.new variable
# (epi_results_2024_pop_gdp.csv.new is the CSV file name with ".new" appended,
# not a column of the dataframe, so it cannot be summarised)
summary(epi.data$RLI.new)
# print values in variable
epi.data$RLI.new
######## Optional ########
## If you want to reference the variable without using the dataframe:
# attach dataframe: puts the columns of epi.data on the search path so they
# can be referenced by bare name
# NOTE(review): attach() is generally discouraged because bare names can be
# masked by other objects; epi.data$RLI.new is the explicit equivalent
attach(epi.data)
# print values in variable (bare name, found via the attached dataframe)
RLI.new
########################
### Explore Variables ###
# --- RLI ---
# pull the RLI column out of the dataframe
RLI <- epi.data$RLI.new
# logical mask: TRUE where RLI is missing, FALSE otherwise
NAs <- is.na(RLI)
# show the missing entries
RLI[NAs]
# --- PHL ---
# pull the PHL column out of the dataframe and show it
PHL <- epi.data$PHL.new
PHL
# logical mask: TRUE where PHL is missing (reuses the NAs name)
NAs <- is.na(PHL)
# show the missing entries
PHL[NAs]
# keep only the observed (non-missing) values
PHL.noNA <- PHL[which(!NAs)]
PHL.noNA
# keep only the observed values greater than 30
PHL.above30 <- PHL.noNA[which(PHL.noNA > 30)]
PHL.above30
# summary statistics of the filtered values
summary(PHL.above30)
# side-by-side boxplots: all RLI values next to the PHL values above 30
# (labels are given in the same order as the plotted variables)
boxplot(RLI, PHL.above30, names = c("RLI", "PHL"))
### Histograms ###
# histogram (frequency distribution) with default bins
# hist(RLI)
# Derive the bin limits from the data instead of hard-coding them: break
# points that do not span the whole range of the variable make hist() fail.
rng <- range(RLI, na.rm = TRUE)
# round the limits outwards to the nearest multiple of 5
lo <- floor(rng[1] / 5) * 5
hi <- ceiling(rng[2] / 5) * 5
# one-line diagnostic of the chosen limits
cat(sprintf(
  "RLI range [%g, %g]; bins from %g to %g\n",
  rng[1], rng[2], lo, hi
))
# fine bins (width 1)
brks <- seq(lo, hi, by = 1)
# density-scaled histogram over the fine bins
hist(RLI, breaks = brks, freq = FALSE)
# coarse bins (width 5) over the same limits
x <- seq(lo, hi, by = 5)
# density-scaled histogram over the coarse bins.
# The break points have to be passed as `breaks =`: a second positional
# vector after RLI is matched to `freq` once `breaks` is named, which raises
# "Error in freq && !equidist : 'length = 15' in coercion to 'logical(1)'".
hist(RLI, breaks = x, freq = FALSE)
# overlay estimated density curve for variable (bw = "SJ" is an alternative)
lines(density(RLI, na.rm = TRUE))
# mark the individual observations along the x-axis
rug(RLI)
# histogram on the density scale, bin widths chosen by the
# Freedman-Diaconis rule ("FD")
hist(RLI, breaks = "FD", freq = FALSE)
# overlay estimated density curve, bandwidth chosen by the
# Sheather-Jones method ("SJ")
lines(density(RLI, na.rm = TRUE, bw = "SJ"))
# mark the individual observations along the x-axis
rug(RLI)
# histogram (density scale) of RLI, used as the backdrop for the normal curves.
# Uses the RLI vector directly rather than the bare column name RLI.new,
# which only resolves while the dataframe is attached.
hist(RLI, breaks = "FD", freq = FALSE)
# grid of x values over which the curves are evaluated
x1 <- seq(5, 95, 1)
# normal density with mean 45 and sd 11, drawn over the histogram
d1 <- dnorm(x1, mean = 45, sd = 11, log = FALSE)
lines(x1, d1)
# normal density with mean 64 and sd 11, drawn over the histogram
d2 <- dnorm(x1, mean = 64, sd = 11, log = FALSE)
lines(x1, d2)
# the second curve again, scaled to half height
lines(x1, 0.5 * d2)
### Empirical Cumulative Distribution Function ###
# build both step functions first, then draw each as a continuous staircase
# (vertical risers drawn, no point markers)
RLI.ecdf <- ecdf(RLI)
PHL.ecdf <- ecdf(PHL)
plot(RLI.ecdf, verticals = TRUE, do.points = FALSE)
plot(PHL.ecdf, verticals = TRUE, do.points = FALSE)
### Quantile-quantile Plots ###
# Q-Q plot of RLI against the theoretical normal distribution
qqnorm(RLI)
qqline(RLI)
# Q-Q plot of random normal numbers against the theoretical normal
# distribution
x <- rnorm(500)
qqnorm(x)
qqline(x)
# Q-Q plot of RLI against a random normal sample of the same size.
# RLI.sub is not defined anywhere else in this script, so it is built here
# from the non-missing RLI values.
# NOTE(review): assumes RLI.sub is meant to be the NA-free subset - confirm
RLI.sub <- RLI[!is.na(RLI)]
qqplot(rnorm(length(RLI.sub)), RLI.sub, xlab = "Q-Q plot for norm dsn")
qqline(RLI.sub)
# Q-Q plots for pairs of samples; each label names the samples compared
qqplot(RLI, PHL, xlab = "Q-Q plot for RLI vs PHL")
qqplot(x, RLI, xlab = "Q-Q plot for norm sample vs RLI")
qqline(RLI)
y <- rnorm(500)
qqplot(x, y, xlab = "Q-Q plot for two norm samples")
qqline(y)
## Statistical Tests
# two independent samples of 500 draws each from the standard normal
# distribution (rnorm defaults: mean 0, sd 1); no seed is set, so the
# results differ from run to run
x <- rnorm(500)
y <- rnorm(500)
# histogram of each sample
hist(x)
hist(y)
# Shapiro-Wilk normality test on each sample
shapiro.test(x)
shapiro.test(y)
# presumably the Anderson-Darling normality test on each sample
# NOTE(review): ad.test() is not part of base R, and only readr and EnvStats
# are loaded at the top of this script - confirm which attached package
# provides it (nortest has a function of this name)
ad.test(x)
ad.test(y)
# two-sample Kolmogorov-Smirnov test: compares the distributions of x and y
ks.test(x,y)
# Wilcoxon rank-sum test (unpaired by default) for a shift in location
wilcox.test(x,y)
# F test comparing the variances of x and y
var.test(x,y)
# two-sample t-test on the means (Welch version, since var.equal defaults
# to FALSE)
t.test(x,y)