- R 소개
- 기초 함수 및 변수 다루기
- 데이터 핸들링
김형준
Data Analyst
S-PLUS -> R (1997, GNU PROJECT)
한계 : R’s biggest challenge is that most R users are not programmers
극복 : Solving the “Two-Language” Problem
예) R Studio
func | func | ||
---|---|---|---|
+ - * / ^ | 단순계산 | ||
sqrt(x) | 제곱근 | ceiling(x) | 올림 |
floor(x) | 내림 | abs(x) | 절댓값 |
trunc(x) | 버림 | log(x) | 자연로그 |
round(x) | 반올림 | round(x,n) | 소수점 n 까지 반올림 |
factorial(n) | n! | choose(n,k) | nCk |
예)
factorial(3)
[1] 6
round(25.233,1)
[1] 25.2
head(installed.packages())
## Package
## abind "abind"
## acepack "acepack"
## acss.data "acss.data"
## ACTCD "ACTCD"
## ada "ada"
## adabag "adabag"
## LibPath
## abind "/Library/Frameworks/R.framework/Versions/3.1/Resources/library"
## acepack "/Library/Frameworks/R.framework/Versions/3.1/Resources/library"
## acss.data "/Library/Frameworks/R.framework/Versions/3.1/Resources/library"
## ACTCD "/Library/Frameworks/R.framework/Versions/3.1/Resources/library"
## ada "/Library/Frameworks/R.framework/Versions/3.1/Resources/library"
## adabag "/Library/Frameworks/R.framework/Versions/3.1/Resources/library"
## Version Priority Depends Imports
## abind "1.4-0" NA "R (>= 1.5.0)" NA
## acepack "1.3-3.3" NA NA NA
## acss.data "1.0" NA "R (>= 2.10)" NA
## ACTCD "1.0-0" NA "R (>= 2.15.1), R.methodsS3" NA
## ada "2.0-3" NA "R(>= 2.10),rpart" NA
## adabag "3.2" NA "rpart, mlbench, caret" NA
## LinkingTo Suggests Enhances License License_is_FOSS
## abind NA NA NA "LGPL (>= 2)" NA
## acepack NA NA NA "MIT + file LICENSE" NA
## acss.data NA NA NA "GPL (>= 2)" NA
## ACTCD NA NA NA "GPL (>= 2)" NA
## ada NA NA NA "GPL" NA
## adabag NA NA NA "GPL (>= 2)" NA
## License_restricts_use OS_type MD5sum NeedsCompilation Built
## abind NA NA NA NA "3.1.1"
## acepack NA NA NA "yes" "3.1.0"
## acss.data NA NA NA "no" "3.1.0"
## ACTCD NA NA NA "yes" "3.1.1"
## ada NA NA NA NA "3.1.0"
## adabag NA NA NA "no" "3.1.0"
install.packages("dplyr")
##
## The downloaded binary packages are in
## /var/folders/_r/ytdlmf5x039_5bzh37xy121r0000gn/T//RtmpypwiTe/downloaded_packages
install.packages("dplyr", repos="http://cran.rstudio.com/")
RSiteSearch("sna")
??sna
c(T,F,T)
[1] TRUE FALSE TRUE
is.numeric(3);is.integer(3)
[1] TRUE [1] FALSE
is.character("Kim"); is.character(c("Kim",3,5)); is.numeric(c("Kim",3,5))
[1] TRUE [1] TRUE [1] FALSE
\[ \sqrt{2} \times \sqrt{2} \ne 2 \]
sqrt(2) * sqrt(2) == 2
[1] FALSE
3.55555666662314214
[1] 3.556
3.55555666662314214 == 3.556; 3.55555666662314214 == 3.55555666662314214
[1] FALSE [1] TRUE
options(digits=5);3.55555666662314214
[1] 3.5556
options(digits=10);3.55555666662314214
[1] 3.555556667
x = matrix(NA, nrow=10, ncol=3)
x
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA NA NA
## [3,] NA NA NA
## [4,] NA NA NA
## [5,] NA NA NA
## [6,] NA NA NA
## [7,] NA NA NA
## [8,] NA NA NA
## [9,] NA NA NA
## [10,] NA NA NA
x[1,3]=3; x[2,]=c(1,5,7); x[3,]=seq(1,3); x[4,]=1:3; x[5,]=rep(1,3)
x[6,]=sep(5,1,by=-2);
x[is.na(x)] = 0
x
## [,1] [,2] [,3]
## [1,] 0 0 3
## [2,] 1 5 7
## [3,] 1 2 3
## [4,] 1 2 3
## [5,] 1 1 1
## [6,] 5 3 1
## [7,] 0 0 0
## [8,] 0 0 0
## [9,] 0 0 0
## [10,] 0 0 0
data_ex = data.frame(height=c(168,182,175),weight=c(50,60,70),sex=c("여","남","남"))
data_ex
## height weight sex
## 1 168 50 여
## 2 182 60 남
## 3 175 70 남
data_ex$married = c(T,F,T)
data_ex[,"bmi"] = data_ex[,"height"]/data_ex[,"weight"]
data_ex
## height weight sex married bmi
## 1 168 50 여 TRUE 3.360000000
## 2 182 60 남 FALSE 3.033333333
## 3 175 70 남 TRUE 2.500000000
attach (data_ex);height
## [1] 168 182 175
detach(data_ex);height
## Error: object 'height' not found
with(data_ex, height);subset(data_ex, height>180)
## [1] 168 182 175
## height weight sex married bmi
## 2 182 60 남 FALSE 3.033333333
library("dplyr")
data_ex%>%filter(height>180)
## height weight sex married bmi
## 1 182 60 남 FALSE 3.033333333
e.x) 미국 휴스턴에서 출발하는 모든 비행기의 2011년 이착륙기록
library("hflights")
library("hflights")
dim(hflights)
## [1] 227496 21
library("hflights")
#hflights
#head(hflights)
hflights[1:3,1:5]
## Year Month DayofMonth DayOfWeek DepTime
## 5424 2011 1 1 6 1400
## 5425 2011 1 2 7 1401
## 5426 2011 1 3 1 1352
hflights_df = tbl_df(hflights)
hflights_df
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## 5430 2011 1 7 5 1359 1509 AA
## 5431 2011 1 8 6 1355 1454 AA
## 5432 2011 1 9 7 1443 1554 AA
## 5433 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
mean(hflights[,"DepTime"]);mean(hflights[,"DepTime"],na.rm=T)
hflights_df_rep<-rbind(hflights_df,hflights_df,hflights_df)
system.time(mean(as.data.frame(hflights_df_rep)[,"DepTime"],na.rm=T))
## user system elapsed
## 0.022 0.001 0.023
system.time(hflights_df_rep%>%select(DepTime)%>%dplyr::summarise(mean(DepTime,na.rm=T)))
## user system elapsed
## 0.007 0.001 0.011
system.time(rbind(hflights_df,hflights_df,hflights_df))
## user system elapsed
## 1.221 0.183 1.452
library("data.table")
##
## Attaching package: 'data.table'
##
## The following object is masked from 'package:hash':
##
## copy
##
## The following objects are masked from 'package:h2o':
##
## month, year
##
## The following object is masked from 'package:dplyr':
##
## last
system.time(rbindlist(list(hflights_df,hflights_df,hflights_df)))
## user system elapsed
## 0.034 0.007 0.048
suppressMessages(library(dplyr))
library(hflights)
flights <- tbl_df(hflights)
flights %>% filter(Month==1, DayofMonth==1)
## Source: local data frame [552 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## 7 2011 1 1 6 557 906 AA 1994
## 8 2011 1 1 6 1824 2106 AS 731
## 9 2011 1 1 6 654 1124 B6 620
## 10 2011 1 1 6 1639 2110 B6 622
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
flights %>% select(DepTime, ArrTime, FlightNum)
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay) # arrange(desc(DepDelay))
## Source: local data frame [227,496 x 2]
##
## UniqueCarrier DepDelay
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
## 6 EV -18
## 7 XE -17
## 8 CO -17
## 9 XE -17
## 10 MQ -17
## .. ... ...
flights %>%
group_by(Dest) %>%
summarise(avg_delay = mean(ArrDelay, na.rm=TRUE))%>%slice(1:2)
## Error: could not find function "slice"
a = flights %>%
group_by(UniqueCarrier) %>%
summarise(mean_Cancelled = mean(Cancelled),mean_Diverted = mean(Diverted))
a%>%slice(1:3)
## Error: could not find function "slice"
b = flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(mean), Cancelled, Diverted)
colnames(a) = colnames(b)
all.equal(a,b)
## [1] TRUE
planes<- group_by(flights, TailNum)
delay<-summarise(planes, count=n(), m_dist = mean(Distance,na.rm=T),
m_delay = mean(ArrDelay,na.rm=T))%>%filter(m_dist > 0, m_delay > 0, count < 800)
library(ggplot2)
ggplot(delay, aes(m_dist, m_delay)) + geom_point(aes(size = count)) + geom_smooth()
plot(delay[,"m_dist"][[1]],delay[,"m_delay"][[1]],xlab="m_delay",ylab="m_delay",cex=delay[,"count"][[1]]/300)
lines(smooth.spline(delay[,"m_dist"][[1]],delay[,"m_delay"][[1]]),col="red")
fread_multi <- function(allFiles){
mc <- detectCores();
cl <- makeCluster(mc);
DT <- parLapply(cl,allFiles,fun=fread,
header=T,
stringsAsFactors=F,
colClasses=c("character","numeric","character",rep("numeric",48)))
#call fread on each core (well... using each core at least)
stopCluster(cl);
DT <- rbindlist(DT);
return(DT);
}
path_dir = "/Users/kimhyungjun/Dropbox/R_lecture/mydeck/r_slide/"
write.csv(delay, file=paste(path_dir,"delay.csv",sep=""),fileEncoding="CP949",
row.names=FALSE)
header, stringAsFactors, colClasses, fileEncoding, row.names
delay_rep = read.csv(file=paste(path_dir,"delay.csv",sep=""),fileEncoding="CP949")
all.equal(as.data.frame(delay), delay_rep)
delay_rep = read.csv(file=paste(path_dir,"delay.csv",sep=""),fileEncoding="CP949",
stringsAsFactors = F)
all.equal(as.data.frame(delay), delay_rep)
\[ parallel > lapply \simeq foreach\]