R Notebook

칼럼명 확인과 변경

# Prepare the data: work on a copy of the built-in iris dataset.
iris2 <- iris
head(x = iris2, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl>  <fctr>
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Inspect the current column names.
colnames(iris2)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
# Rename all five columns at once (positional: one new name per column).
iris2 <- setNames(iris2, c("first", "second", "third", "fourth", "fifth"))
colnames(iris2)
## [1] "first"  "second" "third"  "fourth" "fifth"

사용자 정의함수로 ( 반복적인 작업인 ) scaling 해보기

  • 정규분포에서 난수를 10개씩 뽑아 4개의 칼럼 만들기
  • 그 값들을 scaling을 통해 0~1사이 값으로 만들기
  • (x - min(x)) / (max(x) - min(x)) => 분자 x - min(x)는 0 이상이고 분모 max(x) - min(x)를 넘지 못하므로, 결과는 항상 0과 1 사이(양 끝 포함)이다.
# Build the data frame with tibble.
# Rather than attaching the package via library(tibble), call the function
# through its namespace in one step: package::function.
df <- tibble::tibble(
  a = rnorm(n = 10, mean = 0, sd = 1),
  b = rnorm(n = 10, mean = 0, sd = 1),
  c = rnorm(n = 10, mean = 0, sd = 1),
  d = rnorm(n = 10, mean = 0, sd = 1)
)
head(x = df, n = 6L)
##             a          b          c          d
##         <dbl>      <dbl>      <dbl>      <dbl>
##   -0.02372271  0.8371705 -0.2382807 -0.3827532
##   -1.56250914 -1.4957210 -0.2232320  1.7938363
##    0.77793682  0.3878949 -1.1817327 -1.0509536
##    0.44403743 -0.1861237  0.1523752  0.3055854
##   -0.82957491 -1.5513454  2.1683096  0.8935408
##    1.88238246  0.8386947 -1.1892627  0.3027364
# Per-column summary statistics before scaling.
summary(df)
##        a                 b                  c                  d          
##  Min.   :-1.5702   Min.   :-1.55135   Min.   :-1.97816   Min.   :-1.0510  
##  1st Qu.:-0.6281   1st Qu.:-0.52394   1st Qu.:-1.05749   1st Qu.:-0.2120  
##  Median : 0.4213   Median : 0.21181   Median :-0.23076   Median : 0.3042  
##  Mean   : 0.1132   Mean   : 0.06085   Mean   :-0.03817   Mean   : 0.3449  
##  3rd Qu.: 0.7566   3rd Qu.: 0.83831   3rd Qu.: 1.02052   3rd Qu.: 0.7530  
##  Max.   : 1.8824   Max.   : 1.32477   Max.   : 2.16831   Max.   : 1.7938
# Min-max scaling has to be written out once per column, so it is repetitive.
# Preview the scaled values of column `a` without assigning them.
# na.rm is spelled TRUE: `T` is an ordinary variable that can be reassigned.
(df$a - min(df$a, na.rm = TRUE)) /
  (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
##  [1] 0.447926340 0.002240828 0.680114530 0.583405776 0.214523807
##  [6] 1.000000000 0.570234402 0.000000000 0.721868086 0.655403845
# Overwrite each column with its scaled version (same formula, four times).
df$a <- (df$a - min(df$a, na.rm = TRUE)) /
  (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$b <- (df$b - min(df$b, na.rm = TRUE)) /
  (max(df$b, na.rm = TRUE) - min(df$b, na.rm = TRUE))
df$c <- (df$c - min(df$c, na.rm = TRUE)) /
  (max(df$c, na.rm = TRUE) - min(df$c, na.rm = TRUE))
df$d <- (df$d - min(df$d, na.rm = TRUE)) /
  (max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))

head(df)
##             a          b         c         d
##         <dbl>      <dbl>     <dbl>     <dbl>
##   0.447926340 0.83046623 0.4196044 0.2348857
##   0.002240828 0.01934011 0.4232337 1.0000000
##   0.680114530 0.67425701 0.1920728 0.0000000
##   0.583405776 0.47467574 0.5138186 0.4768503
##   0.214523807 0.00000000 1.0000000 0.6835283
##   1.000000000 0.83099619 0.1902568 0.4758489
# range() returns c(minimum, maximum): element [1] is the min, [2] the max.
# na.rm is spelled TRUE: `T` is an ordinary variable that can be reassigned.
rng <- range(rnorm(10), na.rm = TRUE)
rng[1] # minimum
## [1] -0.7202099
rng[2] # maximum
## [1] 1.028419
# Define the scaling step as a user-defined function.

#' Rescale a numeric vector to the interval [0, 1] (min-max scaling).
#'
#' @param x A numeric vector. `NA`s are ignored when finding the range and
#'   stay `NA` in the result.
#' @return A numeric vector the same length as `x`. The smallest finite value
#'   maps to 0 and the largest finite value maps to 1. If all finite values
#'   are equal the denominator is 0, so those elements come back as `NaN`.
rescale <- function(x) {
  # finite = TRUE drops Inf/-Inf (as well as NA) from the range, so a single
  # infinite value no longer turns the other elements into 0 or NaN.
  rng <- range(x, na.rm = TRUE, finite = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}

# Try it out: first on column `a`, then on a small vector with a known answer.
rescale(df[["a"]])
##  [1] 0.447926340 0.002240828 0.680114530 0.583405776 0.214523807
##  [6] 1.000000000 0.570234402 0.000000000 0.721868086 0.655403845
rescale(seq(from = 0, to = 10, by = 5))
## [1] 0.0 0.5 1.0
# Apply the function to every column, one call per column.
df[["a"]] <- rescale(df[["a"]])
df[["b"]] <- rescale(df[["b"]])
df[["c"]] <- rescale(df[["c"]])
df[["d"]] <- rescale(df[["d"]])

head(x = df, n = 6L)
##             a          b         c         d
##         <dbl>      <dbl>     <dbl>     <dbl>
##   0.447926340 0.83046623 0.4196044 0.2348857
##   0.002240828 0.01934011 0.4232337 1.0000000
##   0.680114530 0.67425701 0.1920728 0.0000000
##   0.583405776 0.47467574 0.5138186 0.4768503
##   0.214523807 0.00000000 1.0000000 0.6835283
##   1.000000000 0.83099619 0.1902568 0.4758489

+ Recent posts