187 lines
3.5 KiB
R
Executable File
187 lines
3.5 KiB
R
Executable File
library(readr)
|
|
library(EnvStats)
|
|
library(nortest)
|
|
|
|
# install.packages(c("readr", "EnvStats", "nortest"))
|
|
|
|
# Set the working directory to the lab folder.
# NOTE: this is a home-relative ("~") path, not a project-relative one, so it
# only resolves on a machine that has this exact folder.
setwd("~/Desktop/Data Analytics/Lab 1")

# Route every plot drawn from here on into one PDF file; the device is closed
# by the dev.off() call at the end of the script.
pdf("all_plots.pdf", width = 8, height = 6)

# Read the data set from the working directory.
epi.data <- read_csv("epi_results_2024_pop_gdp.csv")

# Open the data viewer only when a user is present; in a batch run there is
# nobody to look at the viewer window.
if (interactive()) {
  View(epi.data)
}

# Print a summary of the RLI.new variable (the same column that is printed
# right below).
summary(epi.data$RLI.new)

# Print the values of the variable.
epi.data$RLI.new

# Bind the column to a plain variable instead of attach()-ing the whole data
# frame: RLI.new is the only column this script refers to by its bare name,
# and an explicit binding cannot be masked by other objects on the search path.
RLI.new <- epi.data$RLI.new

# Print the values of the variable via the bare name.
RLI.new
|
|
|
|
########################
|
|
|
|
### Explore Variable ###
|
|
|
|
# Pull the RLI.new column out of the data frame.
RLI <- epi.data$RLI.new

# Logical mask over RLI: TRUE where the entry is NA, FALSE otherwise.
NAs <- is.na(RLI)

# Print the missing entries of RLI.
RLI[NAs]

# Pull the PHL.new column out of the data frame and print it.
PHL <- epi.data$PHL.new
PHL

# NA-free copies of both variables.
RLI_noNA <- RLI[!NAs]
PHL_noNA <- PHL[!is.na(PHL)]

# Fix the RNG seed so the random subsample below is reproducible.
set.seed(1)

# Random subsample of the non-missing RLI values, capped at 180 entries.
RLI_sub <- sample(RLI_noNA, size = min(length(RLI_noNA), 180))

# Second name for the same subsample.
RLI_new_sub <- RLI_sub
|
|
|
|
|
|
# Find NAs in the variable: logical vector, TRUE where PHL is NA, FALSE
# otherwise. (This overwrites the NAs mask computed earlier for RLI.)
NAs <- is.na(PHL)

# Print the missing entries.
PHL[NAs]

# Keep only the non-missing values and print them.
PHL.noNA <- PHL[!NAs]
PHL.noNA

# Keep only the values strictly above 30 and print them.
PHL.above30 <- PHL.noNA[PHL.noNA > 30]
PHL.above30

# Summary statistics of the filtered values.
summary(PHL.above30)

# Side-by-side boxplots of RLI and the filtered PHL values.
# The first box shows RLI, so it is labelled "RLI" (was mislabelled "RHI").
boxplot(RLI, PHL.above30, names = c("RLI", "PHL"))
|
|
|
|
|
|
### Histograms ###
|
|
|
|
# histogram (frequency distribution)
|
|
# hist(RLI)
|
|
|
|
# Bin edges for the histogram: 0, 10, ..., 100.
x <- seq(0, 100, by = 10)

# Density-scaled histogram of RLI over the bin edges defined above.
# The edges are passed explicitly as `breaks`; the previous call referenced an
# undefined object (`brks`) and failed at run time.
# NOTE(review): assumes every RLI value lies within [0, 100]; hist() stops
# with an error if a value falls outside the supplied breaks.
hist(RLI, breaks = x, probability = TRUE)

# Overlay the estimated density curve (NAs dropped); bw = "SJ" is an
# alternative bandwidth worth trying.
lines(density(RLI, na.rm = TRUE))

# Add a rug of the individual observations along the x-axis.
rug(RLI)
|
|
|
|
# Sequence 0, 5, ..., 100. It is not passed to the hist() call below, which
# selects its bins with the "FD" rule instead.
x <- seq(0, 100, by = 5)

# Density-scaled histogram of RLI with "FD" bin selection.
hist(RLI, breaks = "FD", probability = TRUE)

# Overlay the estimated density curve, using the "SJ" bandwidth and
# dropping NAs.
lines(density(RLI, bw = "SJ", na.rm = TRUE))

# Add a rug of the individual observations along the x-axis.
rug(RLI)
|
|
|
|
|
|
# Density-scaled histogram of RLI with "FD" bin selection.
# Uses the RLI vector extracted from epi.data earlier in the script rather
# than the bare column name, so the call does not depend on attach().
hist(RLI, breaks = "FD", probability = TRUE)

# Grid of x values over which the normal curves are evaluated.
x1 <- seq(5, 95, by = 1)

# Normal density with mean 45 and sd 11, drawn over the histogram.
d1 <- dnorm(x1, mean = 45, sd = 11)
lines(x1, d1)

# Normal density with mean 64 and sd 11, drawn over the histogram.
d2 <- dnorm(x1, mean = 64, sd = 11)
lines(x1, d2)

# The same curve scaled to half height.
lines(x1, 0.5 * d2)
|
|
|
|
### Empirical Cumulative Distribution Function ###
|
|
|
|
# Plot the empirical CDF of each variable as a step function: vertical
# segments are drawn at the jumps, point markers are suppressed.
plot(ecdf(RLI), verticals = TRUE, do.points = FALSE)
plot(ecdf(PHL), verticals = TRUE, do.points = FALSE)
|
|
|
|
|
|
### Quantile-quantile Plots ###
|
|
|
|
# Normal Q-Q plot of RLI with its reference line.
qqnorm(RLI)
qqline(RLI)

# Baseline for comparison: normal Q-Q plot of 500 random draws from a
# standard normal distribution.
x <- rnorm(500)
qqnorm(x)
qqline(x)

# Q-Q plot of the RLI subsample against a normal sample of the same size.
# The size is taken from RLI_sub itself, which may hold fewer than 180 values.
qqplot(rnorm(length(RLI_sub)), RLI_sub,
       main = "Q-Q plot for norm dsn",
       xlab = "Normal sample", ylab = "RLI_sub")
qqline(RLI_sub)

# Q-Q plot comparing the two variables with each other.
qqplot(RLI, PHL,
       main = "Q-Q plot for RLI vs PHL",
       xlab = "RLI", ylab = "PHL")

# Q-Q plot of RLI against the normal sample x. The labels now describe what
# is plotted; the previous label was copied from the RLI-vs-PHL plot.
qqplot(x, RLI,
       main = "Q-Q plot for normal sample vs RLI",
       xlab = "Normal sample", ylab = "RLI")
qqline(RLI)

# Q-Q plot of two independent normal samples against each other.
y <- rnorm(500)
qqplot(x, y,
       main = "Q-Q plot for two normal samples",
       xlab = "x", ylab = "y")
qqline(y)
|
|
|
|
|
|
## Statistical Tests

# Two fresh samples of 500 standard-normal draws each.
x <- rnorm(500)
y <- rnorm(500)

# Histogram of each sample.
hist(x)
hist(y)

# Normality tests on each sample: Shapiro-Wilk, then Anderson-Darling.
shapiro.test(x)
shapiro.test(y)
ad.test(x)
ad.test(y)

# Two-sample comparisons of x and y: Kolmogorov-Smirnov, Wilcoxon rank-sum,
# F test of variances, and t-test.
ks.test(x, y)
wilcox.test(x, y)
var.test(x, y)
t.test(x, y)

# Close the current graphics device.
dev.off()
|