---
title: "Scatterplots, Covariance, and Correlation"
author: "Tera Letzring"
date: "September 2017"
output: html_document
---

```{r setup, include=FALSE}

knitr::opts_chunk$set(echo = TRUE)
library(car)
library(plyr)
library (psych)
```

```{r read in data}

# Read in the data. The file name is resolved relative to the current working
# directory; when the document is knit this is normally the folder that
# contains the .Rmd file, so no setwd() call is needed (calling
# setwd(getwd()) does not change the directory).
mydata <- read.table("USairpollution.csv", header = TRUE, sep = ",")

# attach() is kept so that code referring to bare column names
# (e.g. temp, precip) keeps working. In new code prefer data = mydata or
# with(mydata, ...): attached copies do not follow later changes to mydata.
attach(mydata)

# Show the structure of the data frame (column names and types)
str(mydata)
```

```{r scatterplots}

# Basic scatterplot. Columns are taken from mydata explicitly so the code does
# not depend on attach(); the axis labels are set by xlab/ylab.
plot(mydata$temp, mydata$precip, xlab = "Temperature", ylab = "Precipitation")

# Basic scatterplot with a regression fit line. To get this to work, you have
# to run both lines of code at the same time.
plot(mydata$temp, mydata$precip, xlab = "Temperature", ylab = "Precipitation")
abline(lm(precip ~ temp, data = mydata))

### With enhanced features using the scatterplot function from the car package.
# legend.title  sets the title of the legend for the grouping variable
# smoother      whether to include a smooth line (default is to show the line,
#               use FALSE to not show it)
# reg.line      whether to include a regression line (default is to show the
#               line, use FALSE to not show it)
# boxplots      add boxplots in the margins of the plot, for the x variable
#               ("x"), the y variable ("y"), or both variables ("xy")
# lwd           width of the linear regression line (default is 1)
# jitter        makes all data points visible; must be a list with elements
#               named x and/or y that give the jitter factors
# xlim, ylim    change the limits of the axes
# legend.coords where the legend for the grouping variable should be - options
#               are "bottomright", "bottom", "bottomleft", "left", "topleft",
#               "top", "topright", "right", "center". Default is above the
#               figure on the left
# by.groups     TRUE to get separate regression lines per level of the
#               grouping variable, FALSE to get one line
# grid          use FALSE to not show the background grid
#
# NOTE(review): these argument names follow the car 2.x interface. car >= 3.0
# reorganised several of them (e.g. smooth=, regLine=,
# legend=list(title=, coords=), id=list(labels=)) - confirm against the
# installed version of car.
scatterplot(
  precip ~ temp | popSB,
  data = mydata,
  xlab = "Temperature",
  ylab = "Precipitation",
  legend.title = "Population",
  labels = row.names(mydata),
  smoother = FALSE,
  boxplots = "xy",
  jitter = list(x = 1, y = 1),
  xlim = c(40, 80),
  lwd = 3,
  legend.coords = "topleft",
  by.groups = FALSE
)
```

```{r scatterplot matrix}

# Basic scatterplot matrix
scatterplotMatrix(
  ~ SO2 + manu + precip + temp,
  data = mydata,
  labels = row.names(mydata)
)

# Basic scatterplot matrix with grouping variable
scatterplotMatrix(
  ~ SO2 + manu + precip + temp | popSB,
  data = mydata,
  labels = row.names(mydata)
)

### Scatterplot matrix with options.
# diagonal   what to draw on the diagonal: density, boxplot, histogram,
#            qqplot, none (density is the default)
# nclass     number of bins for the histogram
# var.labels labels shown for the variables, in the order of the formula
# smoother, by.groups and lwd are passed the same way as in a scatterplot()
# call: smoother = FALSE hides the smooth line, by.groups = TRUE fits separate
# lines per level of the grouping variable, lwd sets the line width.
#
# NOTE(review): these argument names follow the car 2.x interface; car >= 3.0
# reorganised several of them (e.g. smooth=, regLine=,
# diagonal=list(method=)) - confirm against the installed version of car.
scatterplotMatrix(
  ~ SO2 + manu + precip + temp | popSB,
  data = mydata,
  labels = row.names(mydata),
  var.labels = c(
    "Sulfur Dioxide", "Manufacturing", "Precipitation", "Temperature"
  ),
  diagonal = "histogram",
  nclass = 6,
  smoother = FALSE,
  by.groups = TRUE,
  lwd = 2
)
```

```{r covariance and correlation}

# For two variables: cov() gives the covariance; cor.test() gives the
# correlation with its 95% CI and p-value. with(mydata, ...) looks the columns
# up in mydata directly, so the calls do not depend on attach().
with(mydata, cov(temp, precip))
with(mydata, cor.test(temp, precip))

### Calculate covariances and correlations for all variables in a dataframe or
### matrix. All variables must be numeric. The use option is for telling R how
### to handle missing data, other options are all.obs (for no missing data)
### and complete.obs (listwise deletion). The method option is for telling R
### the type of correlation to compute, other options are spearman and kendall.
# Create a dataset with only numeric variables by dropping column 8.
# NOTE(review): assumes column 8 is the only non-numeric column (presumably
# popSB) - confirm with str(mydata).
mydata.num <- mydata[-8]
cov(mydata.num, use = "pairwise.complete.obs")
cor(mydata.num, use = "pairwise.complete.obs", method = "pearson")

### Compute correlations between certain variables (not a whole matrix)
# These variables will be the rows; the numbers indicate variables 1 and 2
# from the data set
x <- mydata[1:2]

# These variables will be the columns; the numbers indicate variables 3
# through 7
y <- mydata[3:7]

cov(x, y)
cor(x, y)

### Calculate correlation and covariance for different levels of a categorical
### variable using a function from the plyr package
# ddply = input is a dataframe and output is a dataframe
# popSB is the variable we are splitting by
# temp and precip are the variables being correlated; inside summarise they
# are evaluated within each popSB subset of mydata
ddply(mydata, .(popSB), summarise, covariance = cov(temp, precip))
ddply(mydata, .(popSB), summarise, correlation = cor(temp, precip))
```

```{r compare correlations}

### Compare two correlations with paired.r() from the psych package

# Independent correlations.
# Values in order: r1, r2, NULL, N, plus the optional n2.
# NULL fills the slot of the third correlation, which does not exist here.
paired.r(xy = 0.52, xz = 0.12, yz = NULL, n = 60, n2 = 50)

# Dependent correlations.
# Values in order: r31, r32, r12, N
paired.r(xy = 0.20, xz = 0.45, yz = 0.15, n = 103)
```

```{r convert correlations}

# Example values used in the conversions below
r_example <- 0.52
z_example <- 0.576

# Correlation r -> Fisher Z
fisherz(r_example)

# Fisher Z -> correlation r
fisherz2r(z_example)

# Correlation r -> effect size d
r2d(r_example)

```