Untitled

내용정리

    1. 상관관계 : 숫자형-숫자형 변수의 관계, 직선성에 대한 척도, 그래프시 산점도 / 분석시 상관계수
    1. t-test : 범주(요인) 1개 속 2개의 집단(수준, class, 범주의 종류)별 숫자형의 평균차이 검정
    1. anova : 범주(요인) 1개 or 2개 속 3개 이상의 집단(수준, class, 범주의 종류)별 숫자형의 평균차이를 분산분석으로 검정

상관관계

  • moonBook 의 acs데이터
  • 숫자형 height, weight, BMI 칼럼만 가져와서 acs2로 만들고 산점도, 히트맵 그리기
# 0. Data preparation

# moonBook provides the acs data set that every section below works on.
library(moonBook)

# Load acs into the workspace.
data("acs")

# Preview the first rows to check the available columns.
head(x = acs)
##   age    sex cardiogenicShock   entry              Dx   EF height weight

## 1  62   Male               No Femoral           STEMI 18.0    168     72

## 2  78 Female               No Femoral           STEMI 18.4    148     48

## 3  76 Female              Yes Femoral           STEMI 20.0     NA     NA

## 4  89 Female               No Femoral           STEMI 21.8    165     50

## 5  56   Male               No  Radial          NSTEMI 21.8    162     64

## 6  73 Female               No  Radial Unstable Angina 22.0    153     59

##        BMI obesity  TC LDLC HDLC  TG  DM HBP smoking

## 1 25.51020     Yes 215  154   35 155 Yes  No  Smoker

## 2 21.91381      No  NA   NA   NA 166  No Yes   Never

## 3       NA      No  NA   NA   NA  NA  No Yes   Never

## 4 18.36547      No 121   73   20  89  No  No   Never

## 5 24.38653      No 195  151   36  63 Yes Yes  Smoker

## 6 25.20398     Yes 184  112   38 137 Yes Yes   Never
# 1. Keep only the numeric columns needed for the correlation analysis

acs2 <- acs[c("height", "weight", "BMI")]



# 2. Correlation coefficient matrix

# acs contains missing height/weight/BMI values, so incomplete rows are
# dropped via use = "na.or.complete" before the coefficients are computed.
cor(x = acs2, use = "na.or.complete")
##              height    weight          BMI

## height  1.000000000 0.6315767 -0.009049596

## weight  0.631576661 1.0000000  0.762441135

## BMI    -0.009049596 0.7624411  1.000000000
# 3. Scatter plot + correlation coefficient matrix: pair plot 1

library(psych)

# One panel grid for the three numeric columns held in acs2.
pairs.panels(x = acs2)

# 4. 산점도+상관계수 행렬 pairplot 2

library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 

## Attaching package: 'zoo'
## The following objects are masked from 'package:base':

## 

##     as.Date, as.Date.numeric
## 

## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':

## 

##     legend
# Pair plot 2: same acs2 columns, with histograms switched on and pch = 19
# as the point symbol.
chart.Correlation(
  R = acs2,
  histogram = TRUE,
  pch = 19
)

# 5. Correlation coefficient heatmap

library(corrplot)
## corrplot 0.84 loaded
# The correlation matrix is recomputed inline, again dropping incomplete rows.
corrplot(
  corr = cor(x = acs2, use = "na.or.complete")
)

t-test

  • 독립표본 : sex(요인, 범주)의 2집단(수준, class)별 BMI의 평균 차이 검정
  • 보정된 t-test : age를 공변량으로 모형에 추가하여 보정
#### 1. Independent two-sample t-test

# Data preparation

# 1. Independence check: skipped

# 2. Normality check: skipped

# 3. Equal-variance check: index BMI by sex group, then compare the variances

var.test(
  acs[acs$sex == "Male", ]$BMI,
  acs[acs$sex == "Female", ]$BMI
)
## 

##  F test to compare two variances

## 

## data:  acs[acs$sex == "Male", ]$BMI and acs[acs$sex == "Female", ]$BMI

## F = 0.82798, num df = 508, denom df = 254, p-value = 0.07756

## alternative hypothesis: true ratio of variances is not equal to 1

## 95 percent confidence interval:

##  0.6663069 1.0210005

## sample estimates:

## ratio of variances 

##          0.8279795
# 4. Independent two-sample t-test

# BMI is compared between the two sex groups using the pooled-variance
# (var.equal = TRUE) form of the test. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
t.test(BMI ~ sex, data = acs, var.equal = TRUE)
## 

##  Two Sample t-test

## 

## data:  BMI by sex

## t = -0.50823, df = 762, p-value = 0.6114

## alternative hypothesis: true difference in means is not equal to 0

## 95 percent confidence interval:

##  -0.6348532  0.3737344

## sample estimates:

## mean in group Female   mean in group Male 

##             24.19492             24.32548
#### 2. Adjusted t-test

# sex is deliberately written after age in the formula; the purpose of this
# fit is to read the p-value reported on the sex row.
summary(
  aov(BMI ~ age + sex, data = acs)
)
##              Df Sum Sq Mean Sq F value   Pr(>F)    

## age           1    386   385.8  36.097 2.91e-09 ***

## sex           1     26    25.9   2.427     0.12    

## Residuals   761   8134    10.7                     

## ---

## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 93 observations deleted due to missingness
#### 3. Paired t-test

# Data preparation: both vectors must have the same length, because the
# elements at the same position form one before/after pair.

Before_score <- c(80, 30, 50, 40, 40, 10, 20, 30, 60, 70)

After_score <- c(95, 40, 60, 45, 40, 15, 20, 25, 67, 90)



# paired = TRUE is spelled out in full; T is a reassignable variable.
t.test(Before_score, After_score, paired = TRUE)
## 

##  Paired t-test

## 

## data:  Before_score and After_score

## t = -2.8423, df = 9, p-value = 0.01933

## alternative hypothesis: true difference in means is not equal to 0

## 95 percent confidence interval:

##  -12.032489  -1.367511

## sample estimates:

## mean of the differences 

##                    -6.7

ANOVA

  • 1way : smoking요인(범주)의 3집단(수준, class)별 TG의 평균차이 검정
  • 2way : Dx, smoking 두 요인(범주)별 TG의 평균차이 검정
#### 1. One-way ANOVA

# 1. Converting a numeric grouping variable to a categorical one: skipped

# 2. Per-group boxplot, mean and sd: skipped

# 3. Equal-variance check: skipped

# 4. ANOVA

# The fit is stored under its own name instead of `aov`, so the workspace
# object no longer shadows the aov() function it was created with.
fit_smoking <- aov(TG ~ smoking, data = acs)

summary(fit_smoking)
##              Df  Sum Sq Mean Sq F value Pr(>F)  

## smoking       2   65695   32848   4.008 0.0185 *

## Residuals   839 6876082    8196                 

## ---

## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 15 observations deleted due to missingness
# 5. Post-hoc analysis: Duncan

library(laercio)

# NOTE(review): "group" is passed as the second argument although the only
# factor in the model is smoking; the recorded output still reports smoking.
# Confirm the meaning of this argument against the laercio documentation.
LDuncan(fit_smoking, "group")
## 

##  DUNCAN TEST TO COMPARE MEANS 

##  

##  Confidence Level:  0.95 

##  Dependent Variable:  TG

##  Variation Coefficient:  72.28546 % 

##  

## 

##  Independent Variable:  smoking 

##   Factors   Means              

##   Smoker    136.490566037736 a 

##   Never     119.496913580247  b

##   Ex-smoker 116.65            b
#### 2. Two-way ANOVA

# 1. Two-way ANOVA

# Additive model with smoking entered first and Dx second.
aov2 <- aov(
  formula = TG ~ smoking + Dx,
  data = acs
)

summary(object = aov2) # both factors are significant
##              Df  Sum Sq Mean Sq F value   Pr(>F)    

## smoking       2   65695   32848   4.119   0.0166 *  

## Dx            2  201756  100878  12.651 3.87e-06 ***

## Residuals   837 6674326    7974                     

## ---

## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 15 observations deleted due to missingness
# Same additive model with the terms in the opposite order (Dx first).
# This overwrites the previous aov2.
aov2 <- aov(
  formula = TG ~ Dx + smoking,
  data = acs
)

# Both factors are significant again, but the sums of squares and the
# significance levels differ from the smoking-first fit.
summary(object = aov2)
##              Df  Sum Sq Mean Sq F value   Pr(>F)    

## Dx            2  166365   83182  10.432 3.35e-05 ***

## smoking       2  101087   50543   6.338  0.00185 ** 

## Residuals   837 6674326    7974                     

## ---

## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 15 observations deleted due to missingness
# 2. Interaction plots

# Draw the two plots side by side. par() returns the previous settings, which
# are restored at the end so later plots are not left in the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))

# Aggregation function handed to interaction.plot(): a mean that ignores NA
# (TG contains missing values). TRUE is spelled out because T is reassignable.
mean.rm.na <- function(x) mean(x, na.rm = TRUE)

# smoking on the x axis, one trace per Dx level.
interaction.plot(
  x.factor = acs$smoking,
  trace.factor = acs$Dx,
  response = acs$TG,
  fun = mean.rm.na
)

# Dx on the x axis, one trace per smoking level.
interaction.plot(
  x.factor = acs$Dx,
  trace.factor = acs$smoking,
  response = acs$TG,
  fun = mean.rm.na
)

par(old_par)

+ Recent posts