2. R markdown( 변수별 EDA 및 abline 2가지 사용)

2019. 2. 1. 09:19

데이터 준비 및 확인

library(moonBook)
# Load the `acs` example dataset into the workspace.
data(acs) # acute myocardial infarction

#View(acs)
# Number of rows and columns.
dim(acs)

## [1] 857  17

# First rows of the data.
head(acs)

##   age    sex cardiogenicShock   entry              Dx   EF height weight
## 1  62   Male               No Femoral           STEMI 18.0    168     72
## 2  78 Female               No Femoral           STEMI 18.4    148     48
## 3  76 Female              Yes Femoral           STEMI 20.0     NA     NA
## 4  89 Female               No Femoral           STEMI 21.8    165     50
## 5  56   Male               No  Radial          NSTEMI 21.8    162     64
## 6  73 Female               No  Radial Unstable Angina 22.0    153     59
##        BMI obesity  TC LDLC HDLC  TG  DM HBP smoking
## 1 25.51020     Yes 215  154   35 155 Yes  No  Smoker
## 2 21.91381      No  NA   NA   NA 166  No Yes   Never
## 3       NA      No  NA   NA   NA  NA  No Yes   Never
## 4 18.36547      No 121   73   20  89  No  No   Never
## 5 24.38653      No 195  151   36  63 Yes Yes  Smoker
## 6 25.20398     Yes 184  112   38 137 Yes Yes   Never

# Compact structure: storage type and first values of every column.
str(acs)

## 'data.frame':    857 obs. of  17 variables:
##  $ age             : int  62 78 76 89 56 73 58 62 59 71 ...
##  $ sex             : chr  "Male" "Female" "Female" "Female" ...
##  $ cardiogenicShock: chr  "No" "No" "Yes" "No" ...
##  $ entry           : chr  "Femoral" "Femoral" "Femoral" "Femoral" ...
##  $ Dx              : chr  "STEMI" "STEMI" "STEMI" "STEMI" ...
##  $ EF              : num  18 18.4 20 21.8 21.8 22 24.7 26.6 28.5 31.1 ...
##  $ height          : num  168 148 NA 165 162 153 167 160 152 168 ...
##  $ weight          : num  72 48 NA 50 64 59 78 50 67 60 ...
##  $ BMI             : num  25.5 21.9 NA 18.4 24.4 ...
##  $ obesity         : chr  "Yes" "No" "No" "No" ...
##  $ TC              : num  215 NA NA 121 195 184 161 136 239 169 ...
##  $ LDLC            : int  154 NA NA 73 151 112 91 88 161 88 ...
##  $ HDLC            : int  35 NA NA 20 36 38 34 33 34 54 ...
##  $ TG              : int  155 166 NA 89 63 137 196 30 118 141 ...
##  $ DM              : chr  "Yes" "No" "No" "No" ...
##  $ HBP             : chr  "No" "Yes" "Yes" "No" ...
##  $ smoking         : chr  "Smoker" "Never" "Never" "Never" ...

# Per-column summary (quartiles, mean and NA counts for the numeric columns).
summary(acs)

##       age            sex            cardiogenicShock      entry          
##  Min.   :28.00   Length:857         Length:857         Length:857        
##  1st Qu.:55.00   Class :character   Class :character   Class :character  
##  Median :64.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :63.31                                                           
##  3rd Qu.:72.00                                                           
##  Max.   :91.00                                                           
##                                                                          
##       Dx                  EF            height          weight      
##  Length:857         Min.   :18.00   Min.   :130.0   Min.   : 30.00  
##  Class :character   1st Qu.:50.45   1st Qu.:158.0   1st Qu.: 58.00  
##  Mode  :character   Median :58.10   Median :165.0   Median : 65.00  
##                     Mean   :55.83   Mean   :163.2   Mean   : 64.84  
##                     3rd Qu.:62.35   3rd Qu.:170.0   3rd Qu.: 72.00  
##                     Max.   :79.00   Max.   :185.0   Max.   :112.00  
##                     NA's   :134     NA's   :93      NA's   :91      
##       BMI          obesity                TC             LDLC      
##  Min.   :15.62   Length:857         Min.   : 25.0   Min.   : 15.0  
##  1st Qu.:22.13   Class :character   1st Qu.:154.0   1st Qu.: 88.0  
##  Median :24.16   Mode  :character   Median :183.0   Median :114.0  
##  Mean   :24.28                      Mean   :185.2   Mean   :116.6  
##  3rd Qu.:26.17                      3rd Qu.:213.0   3rd Qu.:141.0  
##  Max.   :41.42                      Max.   :493.0   Max.   :366.0  
##  NA's   :93                         NA's   :23      NA's   :24     
##       HDLC             TG             DM                HBP           
##  Min.   : 4.00   Min.   : 11.0   Length:857         Length:857        
##  1st Qu.:32.00   1st Qu.: 68.0   Class :character   Class :character  
##  Median :38.00   Median :105.5   Mode  :character   Mode  :character  
##  Mean   :38.24   Mean   :125.2                                        
##  3rd Qu.:45.00   3rd Qu.:154.0                                        
##  Max.   :89.00   Max.   :877.0                                        
##  NA's   :23      NA's   :15                                           
##    smoking         
##  Length:857        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Count the missing values (NA) in every column. colSums() on the logical
# matrix returned by is.na() is the vectorised equivalent of
# apply(is.na(acs), 2, sum).
colSums(is.na(acs))

##              age              sex cardiogenicShock            entry 
##                0                0                0                0 
##               Dx               EF           height           weight 
##                0              134               93               91 
##              BMI          obesity               TC             LDLC 
##               93                0               23               24 
##             HDLC               TG               DM              HBP 
##               23               15                0                0 
##          smoking 
##                0

library(mlr)

## Loading required package: ParamHelpers

# Per-column overview from mlr: type, NA count, location/dispersion
# statistics and number of levels (see the printed column headers).
summarizeColumns(acs)

##                name      type  na      mean        disp    median
## 1               age   integer   0  63.31155 11.69630382  64.00000
## 2               sex character   0        NA  0.33488915        NA
## 3  cardiogenicShock character   0        NA  0.06067678        NA
## 4             entry character   0        NA  0.36406068        NA
## 5                Dx character   0        NA  0.53325554        NA
## 6                EF   numeric 134  55.83444  9.62296368  58.10000
## 7            height   numeric  93 163.17539  9.07930672 165.00000
## 8            weight   numeric  91  64.84373 11.35519114  65.00000
## 9               BMI   numeric  93  24.28190  3.34669608  24.16104
## 10          obesity character   0        NA  0.33838973        NA
## 11               TC   numeric  23 185.20024 47.77292477 183.00000
## 12             LDLC   integer  24 116.58343 41.09484883 114.00000
## 13             HDLC   integer  23  38.23501 11.08668982  38.00000
## 14               TG   integer  15 125.23872 90.85259401 105.50000
## 15               DM character   0        NA  0.35472579        NA
## 16              HBP character   0        NA  0.41540257        NA
## 17          smoking character   0        NA  0.61260210        NA
##          mad      min       max nlevs
## 1  13.343400  28.0000  91.00000     0
## 2         NA 287.0000 570.00000     2
## 3         NA  52.0000 805.00000     2
## 4         NA 312.0000 545.00000     2
## 5         NA 153.0000 400.00000     3
## 6   7.857780  18.0000  79.00000     0
## 7   7.413000 130.0000 185.00000     0
## 8  10.378200  30.0000 112.00000     0
## 9   3.011608  15.6157  41.42012     0
## 10        NA 290.0000 567.00000     2
## 11 42.995400  25.0000 493.00000     0
## 12 40.030200  15.0000 366.00000     0
## 13 10.378200   4.0000  89.00000     0
## 14 60.045300  11.0000 877.00000     0
## 15        NA 304.0000 553.00000     2
## 16        NA 356.0000 501.00000     2
## 17        NA 204.0000 332.00000     3

VIM 패키지의 aggr() 함수를 통한 결측값(NA) 확인

# install.packages("VIM")
library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## Loading required package: data.table

## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.

## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Visualise the missing-value pattern of `acs` with VIM::aggr().
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables. (Presumably prop = FALSE shows counts instead of proportions and
# numbers = TRUE prints them in the plot -- confirm in ?VIM::aggr.)
aggr(acs, prop = FALSE, numbers = TRUE)

범주형/숫자형 변수 나누어서 분석해주는 라이브러리 xda

# Install the xda package to inspect categorical/numeric variables
#https://github.com/ujjwalkarn/xda
# library(devtools)
# install_github("ujjwalkarn/xda")
# Alternative install: 1) *run R as administrator -> 2) install the githubinstall package -> 3) use githubinstall to download xda -> 4) load the xda library
# install.packages("githubinstall")
# library(githubinstall)
# githubinstall("xda")
# Usage
# library(xda)
# Automatic analysis of the categorical and numeric variables (columns) of a dataset
# bivariate(iris,'Species','Sepal.Length',n.bins=2)
# bivariate(iris,'Sepal.Length','Species')

범주형 변수 분석 (범주 -> 빈도 / 범주-범주 -> 빈도로 이루어진 교차표 )

table()함수로 범주형 1개(단변수), 범주형 2개(이변수)를 빈도/교차표 생성
카이제곱검정 chisq.test(교차표)로 범주형 변수 2개의 연관성 확인
H0 : 서로 독립이다.

# 1. Univariate analysis of a categorical variable
# Tabulate the diagnosis once and reuse the table below.
dx_counts <- table(acs$Dx)
dx_counts # frequency of each category

## 
##          NSTEMI           STEMI Unstable Angina 
##             153             304             400

prop.table(dx_counts) # relative frequency of each category

## 
##          NSTEMI           STEMI Unstable Angina 
##       0.1785298       0.3547258       0.4667445

# Bar chart of the per-category frequencies
barplot(dx_counts, col = c("orange", "blue", "red"))

# install.packages("qualityTools")
library(qualityTools)

## Loading required package: Rsolnp

## 
## Attaching package: 'Rsolnp'

## The following object is masked from 'package:mlr':
## 
##     benchmark

## Loading required package: MASS

## 
## Attaching package: 'qualityTools'

## The following object is masked from 'package:data.table':
## 
##     cube

## The following object is masked from 'package:stats':
## 
##     sigma

# Pareto chart of the diagnosis counts: per-category frequency bars plus the
# cumulative line.
paretoChart(table(acs$Dx))

##                                   
## Frequency         400   304    153
## Cum. Frequency    400   704    857
## Percentage      46.7% 35.5%  17.9%
## Cum. Percentage 46.7% 82.1% 100.0%

##                                              
## Frequency       400.00000 304.00000 153.00000
## Cum. Frequency  400.00000 704.00000 857.00000
## Percentage       46.67445  35.47258  17.85298
## Cum. Percentage  46.67445  82.14702 100.00000

# Same chart with showTable switched off (presumably hides the table drawn
# with the plot -- confirm in ?paretoChart). FALSE is spelled out because F is
# an ordinary, reassignable variable.
paretoChart(table(acs$Dx),
            showTable = FALSE)

##                                   
## Frequency         400   304    153
## Cum. Frequency    400   704    857
## Percentage      46.7% 35.5%  17.9%
## Cum. Percentage 46.7% 82.1% 100.0%

##                                              
## Frequency       400.00000 304.00000 153.00000
## Cum. Frequency  400.00000 704.00000 857.00000
## Percentage       46.67445  35.47258  17.85298
## Cum. Percentage  46.67445  82.14702 100.00000

# 2. Bivariate analysis of two categorical variables
# Build the sex x Dx contingency table once and reuse it below.
tab <- table(acs$sex, acs$Dx)
tab # cross table of frequencies

##         
##          NSTEMI STEMI Unstable Angina
##   Female     50    84             153
##   Male      103   220             247

addmargins(tab) # append the row/column sums to the cross table

##         
##          NSTEMI STEMI Unstable Angina Sum
##   Female     50    84             153 287
##   Male      103   220             247 570
##   Sum       153   304             400 857

prop.table(tab) # relative frequencies over the grand total

##         
##              NSTEMI      STEMI Unstable Angina
##   Female 0.05834306 0.09801634      0.17852975
##   Male   0.12018670 0.25670945      0.28821470

prop.table(tab, margin = 1) # 1 = rows: each sex row is the denominator

##         
##             NSTEMI     STEMI Unstable Angina
##   Female 0.1742160 0.2926829       0.5331010
##   Male   0.1807018 0.3859649       0.4333333

prop.table(tab, margin = 2) # 2 = columns: each Dx column is the denominator

##         
##             NSTEMI     STEMI Unstable Angina
##   Female 0.3267974 0.2763158       0.3825000
##   Male   0.6732026 0.7236842       0.6175000

# Test the independence of the two categorical variables (H0: independent)
chisq.test(table(acs$sex, acs$Dx))

## 
##  Pearson's Chi-squared test
## 
## data:  table(acs$sex, acs$Dx)
## X-squared = 8.7983, df = 2, p-value = 0.01229

# Stacked bar chart of sex within each diagnosis. The first factor given to
# table() forms the rows, i.e. the stacked segments named in the legend.
sex_by_dx <- table(acs$sex, acs$Dx)
mycol <- c("skyblue", "blue") # one colour per sex level
barplot(sex_by_dx, col = mycol, main = "Segmented Bar Plot of Gender")
# Legend labels come from the table itself so they always follow the order in
# which the segments are coloured (no hand-typed labels to drift out of sync).
legend("topleft", legend = rownames(sex_by_dx), col = mycol, pch = 16)

# Stacked bar chart of diagnosis within each sex.
dx_by_sex <- table(acs$Dx, acs$sex)
mycol <- c("skyblue", "blue", "darkblue") # one colour per Dx level
barplot(dx_by_sex, col = mycol, main = "Segmented Bar Plot of Diagnosis")
legend("topleft", legend = rownames(dx_by_sex), col = mycol, pch = 16,
       inset = 0.05)

숫자형 변수 분석

숫자형 변수는 분포를 확인해야 하므로 boxplot을 그린다.
숫자형 변수의 분포 확인 2 - hist로 정규분포 여부 확인

# 1. Univariate analysis of a numeric variable (TG)
# TG contains NAs (see the summary output), so every aggregate below drops
# them explicitly. TRUE is spelled out because T is a reassignable variable.
summary(acs$TG)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    11.0    68.0   105.5   125.2   154.0   877.0      15

quantile(acs$TG, na.rm = TRUE)

##    0%   25%   50%   75%  100% 
##  11.0  68.0 105.5 154.0 877.0

IQR(acs$TG, na.rm = TRUE)

## [1] 86

var(acs$TG, na.rm = TRUE)

## [1] 8254.194

sd(acs$TG, na.rm = TRUE)

## [1] 90.85259

stem(acs$TG) # stem-and-leaf plot

## 
##   The decimal point is 2 digit(s) to the right of the |
## 
##   0 | 12222222233333333333333333333333333333444444444444444444444444444444
##   0 | 55555555555555555555555555555555555555555555556666666666666666666666+220
##   1 | 00000000000000000000000000000000000000000000000111111111111111111111+151
##   1 | 55555555555555555555555555555555555666666666666666666666666666666777+55
##   2 | 000000001111111112222222222222222333344444444444
##   2 | 555556666666667778888899
##   3 | 000111223
##   3 | 5788899
##   4 | 1123
##   4 | 555669
##   5 | 4
##   5 | 58
##   6 | 2
##   6 | 58
##   7 | 
##   7 | 
##   8 | 0
##   8 | 8

#install.packages("mosaic")
library(mosaic)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:moonBook':
## 
##     densityplot

## Loading required package: ggformula

## Loading required package: ggplot2

## Loading required package: ggstance

## 
## Attaching package: 'ggstance'

## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh

## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")

## Loading required package: mosaicData

## Loading required package: Matrix

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following object is masked from 'package:ggplot2':
## 
##     stat

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following object is masked from 'package:qualityTools':
## 
##     dotPlot

## The following object is masked from 'package:mlr':
## 
##     resample

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

dotPlot(acs$TG, pch = 19, col = 6) # dot plot

# Histogram of TG with vertical reference lines at the mean and the median.
# (pch is dropped: a histogram draws bars, not point symbols.)
hist(acs$TG, col = "lightgray")
abline(v = mean(acs$TG, na.rm = TRUE), col = "blue")
abline(v = median(acs$TG, na.rm = TRUE), col = "green")
# The markers are lines, so the legend shows line keys in the same colours.
legend("topright", c("Mean", "Median"), lty = 1, col = c("blue", "green"))

# Side-by-side notched box plots: original scale vs. log scale.
old_par <- par(mfrow = c(1, 2))
boxplot(acs$TG, notch = TRUE, col = "orange", main = "TG (in original scale)")
boxplot(log(acs$TG), notch = TRUE, col = "orange", main = "TG (in log scale)")

par(old_par) # restore the panel layout that was active before




# 2. Bivariate analysis of numeric variables

# Prepare the data. Columns are referenced explicitly (iris$...) instead of
# attach()-ing the data frame, so nothing is added to the search path.
data("iris")
# Inspect the data
library(mlr)
summarizeColumns(iris) # four numeric columns + one categorical column

##           name    type na     mean      disp median     mad  min  max
## 1 Sepal.Length numeric  0 5.843333 0.8280661   5.80 1.03782  4.3  7.9
## 2  Sepal.Width numeric  0 3.057333 0.4358663   3.00 0.44478  2.0  4.4
## 3 Petal.Length numeric  0 3.758000 1.7652982   4.35 1.85325  1.0  6.9
## 4  Petal.Width numeric  0 1.199333 0.7622377   1.30 1.03782  0.1  2.5
## 5      Species  factor  0       NA 0.6666667     NA      NA 50.0 50.0
##   nlevs
## 1     0
## 2     0
## 3     0
## 4     0
## 5     3

# Distribution of a single numeric variable
boxplot(iris$Sepal.Length, notch = TRUE, col = "grey")

# Distribution of a numeric variable within each category
boxplot(Sepal.Length ~ Species, data = iris, notch = TRUE, col = "orange")

# Second look at the distribution: histogram to judge normality.
# freq = FALSE plots densities, so the y axis is labelled "Density"; xlab is
# set explicitly to keep the plain column name on the x axis.
hist(iris$Sepal.Length,
     col = "grey",
     main = "R default",
     xlab = "Sepal.Length",
     ylab = "Density",
     freq = FALSE)

plot(iris[, -5]) # basic scatterplot matrix

# NOTE(review): for a data frame with more than two columns, plot() appears to
# forward extra arguments to pairs(), whose second positional argument is
# `labels` -- so this does not select the Petal.Length column. It looks like
# xda::Plot(iris, 'Petal.Length') was intended; confirm before relying on it.
plot(iris,'Petal.Length') # scatterplot matrix "for a specific numeric column"

pairs(iris[, -5], gap = 0) # scatterplot matrix

pairs(iris[, -5], gap = 0, panel = panel.smooth) # + smooth line per panel

# Scatterplot matrices (numeric vs. numeric) coloured by category
# install.packages("mycor")
library(mycor)

# `groups = Species` stays a bare name on purpose: plot.mycor presumably
# resolves it against the columns of the data given to mycor() -- TODO confirm.
plot(mycor(iris), type = 1, groups = Species)

plot(mycor(iris), type = 2, groups = Species) # adds histograms

plot(mycor(iris), type = 3, groups = Species) # adds correlations and fitted curves

abline의 2가지 이용기

1. 숫자-숫자형의 산점도에서 나오는 회귀식 결과 or 절편(a)과 기울기(b)로 회귀선을 그림
2. 수평(h) 혹은 수직(v)선을 추가로 그려줌

 #abline-회귀식에서 나온 계수로 회귀선 그리기
# abline() use 1: draw a regression line, either from an explicit intercept
# (a) and slope (b) or directly from a fitted lm object.
# Colour names are written without stray blanks so they match colors() exactly.
plot(cars, xlim = c(0, 25))
abline(a = -5, b = 3.5, col = "red")
abline(lm(dist ~ speed, data = cars), col = "green")

# abline() use 2: add horizontal (h) / vertical (v) reference lines,
# here at the mean of dist and the mean of speed.
plot(cars, xlim = c(0, 25))
abline(a = -5, b = 3.5, col = "red")
abline(h = mean(cars$dist), lty = 2, col = "blue")
abline(v = mean(cars$speed), lty = 2, col = "green")

저작자표시 (새창열림)

'한의대 생활 > └ 통계에 대한 나의 정리' 카테고리의 다른 글

3. 통계적 추론과 가설검정, p-value (0)	2019.02.05
2-2 R markdown ggplot2 ( plotly 올릴시 에러 ) (0)	2019.02.01
2. 전처리시 체크2가지 및 EDA시 변수의 성격에 따른 분류 (0)	2019.01.25
1. R markdown(데이터경로, 불러오기, 5가지확인, summarizeColumns, mytable, mycsv) (0)	2019.01.25
1. 통계 - 기술통계와 추론통계 , 표본추출방법들 (0)	2019.01.19

우아 한의원 | 조재성 원장의 한의학, 의학통계, 프로그래밍

Menu

Category

Notice

Recent comments

Links

2. R markdown( 변수별 EDA 및 abline 2가지 사용)

EDA(Exploratory Data Analysis)

데이터 준비 및 확인

VIM 패키지의 VIM 라이브러리를 통한 null값 확인

범주형/숫자형 변수 나누어서 분석해주는 라이브러리 xda

범주형 변수 분석 (범주 -> 빈도 / 범주-범주 -> 빈도로 이루어진 교차표 )

숫자형 변수 분석

abline의 2가지 이용기

'한의대 생활 > └ 통계에 대한 나의 정리' 카테고리의 다른 글

+ Recent posts

티스토리툴바