---
title: "Descriptive statistics"
author: "Tera Letzring"
date: "September 2017"
output: html_document
---

```{r setup, include=FALSE}

# Default chunk option for the whole document: echo = TRUE shows each chunk's
# R source alongside its output in the knitted HTML.
knitr::opts_chunk$set(echo = TRUE)
# psych supplies describe() and describeBy(), which the descriptive statistics
# chunk calls; it must be installed before knitting.
library(psych)
```

```{r set wd and read in data}

# Read the school-days data set into a data frame.
#
# The path is relative to the current working directory. When the document is
# knitted, knitr evaluates chunks in the folder that contains the .Rmd file, so
# "schooldays.csv" only needs to sit next to it. No setwd() call is needed:
# setting the working directory to the value of getwd() changes nothing.
#
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 too (the
# default changed to FALSE in R 4.0.0), so summary() reports frequency counts
# for them instead of just "character".
SDdata <- read.table(
  "schooldays.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

# Print the first rows as a quick check that the file was parsed as expected.
head(SDdata)
```

```{r descriptive statistics}

# Columns are referenced explicitly as SDdata$<name> instead of via attach().
# attach() places a copy of the data on the search path that is never detached,
# can mask (or be masked by) other objects with the same name, and silently
# goes stale if SDdata is modified afterwards.

# calculate basic descriptives for the number of days absent
table(absent = SDdata$absent)   # frequency distribution
mean(SDdata$absent)
median(SDdata$absent)
sd(SDdata$absent)               # standard deviation
var(SDdata$absent)              # variance
range(SDdata$absent)            # returns the minimum and maximum scores
quantile(SDdata$absent)         # quantiles at 0, 25, 50, 75 and 100 percent
IQR(SDdata$absent)              # interquartile range

# min, 1st quartile, median, mean, 3rd quartile and max for numeric variables;
# frequency counts for factor variables
summary(SDdata)

# describe() is from the psych package, so psych must be loaded to use it.
# Columns of the result: vars (item number), n (number of valid cases), mean,
# sd, median, trimmed (trimmed mean), mad (median absolute deviation), min,
# max, range, skew, kurtosis, se (standard error). Statistics are also given
# for non-numeric variables, but those are marked with an asterisk after the
# variable name.
describe(SDdata)

# calculate mode: count each distinct value, then keep the value(s) with the
# highest count. The result is a character vector and has more than one
# element when several values tie for the highest count.
absent_counts <- table(SDdata$absent)
names(absent_counts)[absent_counts == max(absent_counts)]

# describe data by one grouping variable. describeBy() is in the psych
# package; mat = TRUE returns the result as a single table rather than a list.
describeBy(SDdata$absent, SDdata$gender, mat = TRUE)

# describe data by two grouping variables
describeBy(SDdata$absent, list(SDdata$gender, SDdata$learner), mat = TRUE)

# calculate mean absences for every combination of learner, race and school.
# The result is a 3-dimensional array; it is computed once, stored, and then
# printed. Combinations with no observations come out as NA.
new.means <- tapply(
  SDdata$absent,
  list(learner = SDdata$learner, race = SDdata$race, school = SDdata$school),
  mean,
  na.rm = TRUE
)
new.means

# output to .csv file in the working directory. write.csv() coerces the array
# to a data frame first, so the file has one row per learner level and one
# column per race/school combination.
write.csv(new.means, "new.means.csv", row.names = TRUE)
```