---
title: "Lab 2 - Manipulating data"
author: "Tera"
date: "August 2017"
output: html_document
---

```{r setup, include=FALSE}

# Global chunk option: with echo = TRUE each chunk's source code is included
# in the output file; with echo = FALSE only the chunk's output is included.

library(knitr)
# Spell out TRUE rather than T: T is an ordinary variable that can be
# reassigned, TRUE is a reserved word.
knitr::opts_chunk$set(echo = TRUE)
```

```{r reading in other data files and some basic functions}

# Read in the data. The file names are relative paths, so the .csv files must
# be in the working directory. When the document is knitted, knitr evaluates
# the chunks in the folder that contains this file by default; when running
# the lines interactively, make that folder the working directory first.
# (The previous setwd(getwd()) call set the working directory to itself and
# therefore did nothing, so it has been removed.)

# read.csv() is read.table() with defaults suited to comma-separated files:
# header = TRUE, sep = ",", only double quotes treated as quoting characters,
# and no comment character (so ' and # inside a field are read literally).
IntCon <- read.csv("IntCon.csv")
head(IntCon)

schooldays1 <- read.csv("schooldays1.csv")
schooldays2 <- read.csv("schooldays2.csv")
head(schooldays1)
head(schooldays2)
```

```{r calculate scale scores}

# multicon provides scoreTest(), used below to score the scale.
library(multicon)

# Scoring key for the IC scale. Each number is the position of an item;
# items with a negative sign are reverse scored.
ICkeys.list <- list(IC = c(-1, 2, -3, 4, 5, -6, 7, -8, -9))

# Score the measure and store the result in ICout. Column 1 is dropped before
# scoring because it isn't part of the measure. Rows with less than 80% of the
# data will be given NA.
ICout <- scoreTest(IntCon[-1], ICkeys.list, rel = TRUE)

# Descriptives for the scale score
describe(ICout[["scores"]])

# Add the scores to the original dataset. cbind() places two data frames side
# by side: both must have the same number of rows, and the rows must be in the
# same order, because they are matched purely by position.
IntCon <- cbind(IntCon, ICout[["scores"]])
head(IntCon)

# Output the reliability information
ICout[["rel"]]

# Write the data set, now including the scale score, to a file
write.csv(IntCon, file = "IntCon.scores.csv", row.names = FALSE)
```

```{r sort data}

# Sorting a data frame: order() returns the row positions that put its
# arguments in sorted order, and indexing the rows with that result rearranges
# the data frame.
#
# View() opens a data viewer, which is only meant for interactive sessions.
# When the session is not interactive (for example while knitting), the first
# rows are printed with head() instead.

# sort by a single variable in increasing order
absent.sorted1inc <- schooldays1[order(schooldays1$absent), ]
if (interactive()) {
  View(absent.sorted1inc)
} else {
  head(absent.sorted1inc)
}

# sort by a single variable in decreasing order
absent.sorted1dec <- schooldays1[order(schooldays1$absent, decreasing = TRUE), ]
if (interactive()) {
  View(absent.sorted1dec)
} else {
  head(absent.sorted1dec)
}

# sort by 2 variables: first by learner, then by absent within each learner
absent.sorted2 <- schooldays1[order(schooldays1$learner, schooldays1$absent), ]
if (interactive()) {
  View(absent.sorted2)
} else {
  head(absent.sorted2)
}
```

```{r subset data frames by variables and values}

# List the variable names so the positions used below can be checked
colnames(schooldays1)

# Each selection below keeps all rows (nothing before the comma) and picks
# variables after the comma, either by position or by name.

# keep variables 1 through 3
Absence1 <- schooldays1[, 1:3]
colnames(Absence1)

# keep variables 1 and 6
Absence2 <- schooldays1[, c(1, 6)]
colnames(Absence2)

# keep the variables listed by name
Absence3 <- schooldays1[, c("SID", "race", "absent")]
colnames(Absence3)

# discard variable 2 (a negative position drops that variable)
Absence4 <- schooldays1[, -2]
colnames(Absence4)

# discard variables 2, 3, and 4
Absence5 <- schooldays1[, -(2:4)]
colnames(Absence5)
```

```{r subsetting data frames with dplyr package and the filter function}

### %>% is called a pipe operator: it passes the object on its left to the
### function on its right as that function's first argument.
# load dplyr library to use filter function
library(dplyr)

# Inside filter(), refer to variables by their bare names (gender, not
# schooldays1$gender). filter() looks the names up in the data frame that was
# piped in, so the condition always matches the rows being filtered.
#
# View() opens a data viewer, which is only meant for interactive sessions.
# When the session is not interactive (for example while knitting), the first
# rows are printed with head() instead.

# keep only females
Female <- schooldays1 %>%
  filter(gender == "female")
if (interactive()) {
  View(Female)
} else {
  head(Female)
}

# keep only aboriginal females. The symbol & means and.
Aborg.Female <- schooldays1 %>%
  filter(gender == "female" & race == "aboriginal")
if (interactive()) {
  View(Aborg.Female)
} else {
  head(Aborg.Female)
}

# keep students in levels F0 and F1. The symbol | means or.
F0F1 <- schooldays1 %>%
  filter(school == "F0" | school == "F1")
if (interactive()) {
  View(F0F1)
} else {
  head(F0F1)
}

# keep students who were absent less than 10 days
days10 <- schooldays1 %>%
  filter(absent < 10)
if (interactive()) {
  View(days10)
} else {
  head(days10)
}
```

```{r merging data sets}

# Bind by rows: stack one data frame underneath the other. Both data frames
# must have the same variables, but the variables do not have to be in the
# same order.
schooldays.all <- rbind(schooldays1, schooldays2)
tail(schooldays.all)

# Merge two data frames on one or more common variables (here SID). The data
# frames may have differing numbers of rows, but the result only contains the
# rows whose SID appears in both.
all.merge <- merge(x = schooldays.all, y = IntCon, by = "SID")
head(all.merge)
tail(all.merge)

# Merge but keep only some variables: select SID and IC from IntCon before
# merging, so only the IC score is added to the school data.
some.merge <- merge(x = schooldays.all, y = IntCon[, c("SID", "IC")], by = "SID")
head(some.merge)
```