데이터 개념 이해하기¶

벡터 만들기¶

# Double (numeric) vector: plain number literals are stored as doubles.
a1 <- c(1, 2, 3, 4, 5)
a1

is(a1)

# Integer vector: the L suffix marks each literal as an integer.
a2 <- c(1L, 2L, 3L)
a2

is(a2)

# Convert the double vector to an integer vector.
a3 <- as.integer(a1)
a3

is(a3)

# Double vector with fractional values.
b <- c(1.23, 3.14, 6.66)
b

# Character vector.
c1 <- c("a", "b", "c", "a")
c1

is(c1)

# Mixing numbers with a string coerces every element to character.
c2 <- c(1, 2, 3, "z")
c2

is(c2)

# Convert the character vector to a factor (nominal variable).
d1 <- as.factor(c1)
d1

is(d1)

data frame 만들기¶

벡터들의 길이(원소 개수)가 같아야 한다

# Three vectors of equal length, one per column.
a1 <- c(1, 2, 3, 4, 5)
b1 <- c("a", "b", "c", "d", "e")
c1 <- c(1.1, 2.2, 3.3, 4.4, 5.5)

# Without explicit names the columns are named after the variables.
df1 <- data.frame(a1, b1, c1)
df1

# Give each column an explicit name.
df2 <- data.frame(count = a1, name = b1, meanCount = c1)
df2

외부 데이터 가져오기 예제¶

# Load the student list from CSV.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 defaults to FALSE;
# without it the text columns load as character and str() below reports no
# Factor columns.
df_csv <- read.csv(
  "R-ggagi-data/example_studentlist.csv",
  stringsAsFactors = TRUE
)
# Variant that treats the first line as data instead of column names:
# df_csv <- read.csv('R-ggagi-data/example_studentlist.csv', header = FALSE)

df_csv

# Extract a single column
df_csv$height

# Check whether the extracted column is a vector
is.vector(df_csv$height)

# Inspect the structure of the data frame
str(df_csv)

'data.frame':	17 obs. of  8 variables:
 $ name     : Factor w/ 17 levels "강수친","김길동",..: 2 12 17 6 10 7 1 14 13 9 ...
 $ sex      : Factor w/ 2 levels "남자","여자": 1 2 1 1 2 2 2 2 1 1 ...
 $ age      : int  23 22 24 23 20 21 22 23 23 22 ...
 $ grade    : int  3 2 4 3 1 2 1 1 3 2 ...
 $ absence  : Factor w/ 2 levels "무","유": 2 1 1 1 2 1 1 1 1 1 ...
 $ bloodtype: Factor w/ 4 levels "A","AB","B","O": 4 2 3 2 1 4 4 1 3 3 ...
 $ height   : num  165 170 175 182 168 ...
 $ weight   : num  68.2 53 80.1 85.7 49.5 52 45.3 55 64.2 61.3 ...

# Variable selection 1: by column name
df_csv[["height"]]

# Variable selection 2: by position (R indexes from 1)
df_csv[[7L]]

# Variable selection 3: single bracket
df_csv[7L]

# Compare the results: the first two are vectors, the last is a data frame
is(df_csv[["height"]])
is(df_csv[[7L]])
is(df_csv[7L])

# Selecting several variables 1: positions
df_csv[6:7]

# Selecting several variables 2: names
df_csv[c("bloodtype", "height")]

# Row/column indexing: df[row, column]
# Take every row, but only the 7th column
df_csv[, 7]

df_csv[2, ]

df_csv[2, 1]

# Use <- for assignment, matching the other assignments in this script
df_raw2 <- df_csv[2, ]

df_raw2[1] # single bracket: returns a data frame, not the bare value

df_raw2[[1]] # double bracket: returns only the value

# Reload so that strings stay plain character values instead of factors
df_csv <- read.csv(
  "R-ggagi-data/example_studentlist.csv",
  stringsAsFactors = FALSE
)

df_csv[2, 1]

# Show the current search path
search()

# Add the data frame to the search path
attach(df_csv)

# Its columns can now be referenced directly by name
height

search()

# Remove the data frame from the search path again
detach(df_csv)

search()

subset() : 조건으로 관측치(행) 선택¶

# Filter rows with subset(): the second argument is the condition.
# Keep the observations whose height is greater than 170.
subset(df_csv, height > 170)

select : 변수 선택해서 취하거나 버리기¶

# Show everything except one variable: negate its name in select
subset(df_csv, select = -height)

# Show everything except several variables
subset(df_csv, select = -c(height, weight))

colnames() : 변수명 바꾸기¶

# List the current variable names
colnames(df_csv)

# Rename a single variable: the 6th one
colnames(df_csv)[6L] <- "blood"
df_csv

# Rename every variable, then restore the original names

df_csv <- read.csv(file = "R-ggagi-data/example_studentlist.csv")

old_list <- colnames(df_csv)
new_list <- c("na", "se", "ag", "gr", "ab", "bl", "he", "we")
colnames(df_csv) <- new_list
head(df_csv)
colnames(df_csv) <- old_list
head(df_csv)

cbind() : 새로운 변수 추가¶

# Body mass index = weight (kg) / height (m)^2.
# The height column holds centimetre-scale values (165, 170, ...), so it is
# converted to metres first; squaring the raw centimetre value makes the
# result 10,000 times too small.
# NOTE(review): assumes weight is in kilograms - confirm against the data source.
bmi <- df_csv$weight / (df_csv$height / 100)^2
bmi

# Append the new vector to the data frame as an extra column
df_bmi <- cbind(df_csv, bmi)
df_bmi

merge() : 2개의 데이터프레임 합치기¶

# Read the two tables that will be joined
df_1 <- read.csv(file = "R-ggagi-data//example_studentlist.csv")
df_2 <- read.csv(file = "R-ggagi-data//omit.csv")
df_1
df_2

# Join them on the shared "name" column
df_merge <- merge(x = df_1, y = df_2, by = "name")
df_merge

rbind() : 행으로 추가하기¶

같은 이름의 열을 가지고 있어야 함

# First and last six rows of the merged table
df_head <- head(df_merge, n = 6L)
df_tail <- tail(df_merge, n = 6L)
df_head
df_tail

# Stack the two pieces row-wise
df_rbind <- rbind(df_head, df_tail)
df_rbind

list : 모든 종류의 데이터 객체 담기¶

# Objects of different kinds to store in one list:
# a data frame, an integer vector, a character vector, a logical vector
# and a function.
df <- df_merge
n <- 1:20
s <- c("a", "b", "c")
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, while TRUE and FALSE are reserved words.
b <- c(TRUE, FALSE, TRUE, FALSE, TRUE)
tmp_list1 <- list(df, n, s, b, mean)
tmp_list1

function (x, ...) 
UseMethod("mean")

# Build the list with a name for every element
tmp_list2 <- list(
  DataFrame = df,
  Number = n,
  String = s,
  Bool = b,
  Func = mean
)
tmp_list2

function (x, ...) 
UseMethod("mean")

# Delete the first element by assigning NULL to it
tmp_list1[[1L]] <- NULL
tmp_list1

function (x, ...) 
UseMethod("mean")

# Select one element, by position and then by name
tmp_list2[2L]
tmp_list2["Number"]

class() : 자료형 확인¶

함수에 인자로 사용 시 주의

# Single bracket versus double bracket on the unnamed list
class(tmp_list1[1L])
class(tmp_list1[[1L]])

# The same comparison on the named list
class(tmp_list2[1L])
class(tmp_list2[[1L]])

# Selecting several elements

# list
tmp_list2[2:3]

# list
tmp_list2[c("Number", "String")]

# integer
tmp_list2[["Number"]]

# character
tmp_list2[["String"]]

split()¶

# Return the heights as a list with one element per sex:
# the first argument holds the values, the second the grouping
height_sex <- split(x = df$height, f = df$sex)
height_sex

# Only the values of one group
height_sex[["여자"]]

class(height_sex)

mean() : 리스트 평균 구하기¶

# mean() needs a numeric or logical argument. A list, or the one-element
# sub-list returned by single brackets, gives NA with a warning; only the
# vector extracted with double brackets yields a mean.
mean(height_sex)    # list: returns NA with a warning
mean(height_sex[1])
mean(height_sex[[1]])

Warning message in mean.default(height_sex):
"argument is not numeric or logical: returning NA"

Warning message in mean.default(height_sex[1]):
"argument is not numeric or logical: returning NA"

sapply() : 리스트 항목별로 작업¶

# Mean of each list element
height_sex_mean <- sapply(X = height_sex, FUN = mean)
height_sex_mean
height_sex_mean[1L]
height_sex_mean[[1L]]

# Standard deviation of each list element
height_sex_sd <- sapply(X = height_sex, FUN = sd)
height_sex_sd
height_sex_sd[2L]
height_sex_sd[[2L]]

# Range of each list element
height_sex_range <- sapply(X = height_sex, FUN = range)
height_sex_range
height_sex_range[1L, ]  # first row
height_sex_range[, 1L]  # first column

명목형 변수 도수분포표 만들기¶

table() : 빈도수¶

# Frequency of each blood type with table()
blood_type_table <- table(df[["bloodtype"]])
blood_type_table

 A AB  B  O 
 4  3  5  5

prop.table() : 상대도수¶

# Relative frequency of each blood type with prop.table()
blood_type_prop_table <- prop.table(x = blood_type_table)
blood_type_prop_table

        A        AB         B         O 
0.2352941 0.1764706 0.2941176 0.2941176

rbind() : 함께보기¶

# Stack the frequency and relative-frequency tables as two rows.
# Each row is labelled with the name of the variable passed to rbind().
blood_type_rbind <- rbind(blood_type_table, blood_type_prop_table)
blood_type_rbind

addmargins() : table 객체 합 구하기¶

# margin chooses the dimension that is summed over:
# margin = 1 appends a Sum row (column totals),
# margin = 2 appends a Sum column (row totals).
blood_type_rbind_sum <- addmargins(A = blood_type_rbind, margin = 2)
blood_type_rbind_sum

연속형 변수 도수분포표 만들기¶

구간을 먼저 나눈다 : 계급

cut() : 구간 나누기¶

# Divide the heights into 4 intervals with cut()
height_break_4 <- cut(x = df$height, breaks = 4)
height_break_4

head(df$height) # raw heights, to compare with the intervals assigned above

# Frequency of each interval with table()
height_break_4_table <- table(height_break_4)
height_break_4_table

height_break_4
(155,162] (162,169] (169,175] (175,182] 
        2         6         3         6

# Relative frequency of each interval with prop.table()
height_break_4_prop_table <- prop.table(x = height_break_4_table)
height_break_4_prop_table

height_break_4
(155,162] (162,169] (169,175] (175,182] 
0.1176471 0.3529412 0.1764706 0.3529412

# Show frequency and relative frequency together with rbind().
# Each row is labelled with the name of the variable passed in.
height_break_4_rbind <- rbind(height_break_4_table, height_break_4_prop_table)
height_break_4_rbind

cumsum() : 누적상대도수¶

# Append a third row: the running total of row 2 (the relative frequencies),
# i.e. the cumulative relative frequency.
height_break_4_cumsum <- rbind(height_break_4_rbind, cumsum(height_break_4_rbind[2,]))
height_break_4_cumsum

rownames() : 행 이름 추가/변경¶

# Label the three rows: frequency, relative frequency, cumulative relative
# frequency. The third row is the cumsum of the relative frequencies, so it
# is labelled as cumulative *relative* frequency rather than cumulative
# frequency.
rownames(height_break_4_cumsum) <- c('도수', '상대도수', '누적상대도수')
height_break_4_cumsum

예제 studentlist2 도수분포표로 만들기¶

student_list2 <- read.csv(file = "R-ggagi-data/example_studentlist2.csv")
# student_list2

# Frequency of each grade with table()
student_list2_table <- table(student_list2[["grade"]])
student_list2_table

 1  2  3  4 
16 37 31 16

# Relative frequency of each grade with prop.table()
student_list2_porp_table <- prop.table(x = student_list2_table)
student_list2_porp_table

   1    2    3    4 
0.16 0.37 0.31 0.16

# Show frequency and relative frequency together with rbind().
# Each row is labelled with the name of the variable passed in.
student_list2_rbind <- rbind(student_list2_table, student_list2_porp_table)
student_list2_rbind

# Append a Sum column holding the total of each row
student_list2_rbind_sum <- addmargins(student_list2_rbind, margin = 2)
student_list2_rbind_sum

# Cumulative relative frequency: running total of row 2
student_list2_cumsum <- rbind(student_list2_rbind, cumsum(student_list2_rbind[2, ]))
student_list2_cumsum

# Set the row names (rownames() labels rows, not columns). The third row is
# the cumsum of the relative frequencies, so it is labelled as cumulative
# *relative* frequency rather than cumulative frequency.
rownames(student_list2_cumsum) <- c('도수', '상대도수', '누적상대도수')
student_list2_cumsum

분할표 만들기 : 두 변수의 빈도수를 나타내는 표¶

df <- read.csv(file = "R-ggagi-data/example_studentlist.csv")
# df

# Two-way frequency table: sex in the rows, blood type in the columns
sex_blood_table <- table(df[["sex"]], df[["bloodtype"]])
sex_blood_table

      
       A AB B O
  남자 2  2 4 2
  여자 2  1 1 3

# Add both a Sum row and a Sum column to the frequency table
addmargins(A = sex_blood_table)

# margin = 2: append a Sum column holding the total of each row
addmargins(A = sex_blood_table, margin = 2)

# margin = 1: append a Sum row holding the total of each column
addmargins(A = sex_blood_table, margin = 1)

# Relative frequency of every cell, as a share of the whole table
sex_blood_prop_table <- prop.table(x = sex_blood_table)
sex_blood_prop_table

      
                A         AB          B          O
  남자 0.11764706 0.11764706 0.23529412 0.11764706
  여자 0.11764706 0.05882353 0.05882353 0.17647059

# Add row and column sums to the relative-frequency table
addmargins(A = sex_blood_prop_table)

# Relative frequency within each row (every row sums to 1)
sex_blood_prop_table_margin_1 <- prop.table(x = sex_blood_table, margin = 1)
sex_blood_prop_table_margin_1

      
               A        AB         B         O
  남자 0.2000000 0.2000000 0.4000000 0.2000000
  여자 0.2857143 0.1428571 0.1428571 0.4285714

# Relative frequency within each column (every column sums to 1)
sex_blood_prop_table_margin_2 <- prop.table(x = sex_blood_table, margin = 2)
sex_blood_prop_table_margin_2

      
               A        AB         B         O
  남자 0.5000000 0.6666667 0.8000000 0.4000000
  여자 0.5000000 0.3333333 0.2000000 0.6000000

결측치 NA (값이 없는 경우) 처리¶

complete.cases() : NA값을 조사해 논리값으로 반환
na.omit() : 행에 NA가 있으면 행 삭제

# Numeric vector with one missing value in position 5
a <- c(1, 2, 3, 4, NA, 6, 7, 8, 9, 10)
complete.cases(a)

# Keep only the elements that are not missing
a[complete.cases(a)]

# na.omit() drops the missing element as well
na.omit(a)

bloodtype	height
O	165.3
AB	170.1
B	175.0
AB	182.1
A	168.0
O	162.0
O	155.2
A	176.9
B	178.5
B	176.1
O	167.1
AB	180.0
A	162.2
O	176.1
B	158.2
B	168.6
A	169.2

bloodtype	height
O	165.3
AB	170.1
B	175.0
AB	182.1
A	168.0
O	162.0
O	155.2
A	176.9
B	178.5
B	176.1
O	167.1
AB	180.0
A	162.2
O	176.1
B	158.2
B	168.6
A	169.2

name	footsize
강수친	245
김길동	270
김동수	265
김미진	235
김민수	270
김철수	280
박미희	240
박수호	NA
방희철	275
손세수	240
여수근	265
이미린	245
이철린	NA
이희수	245
이희진	245
임동민	280
홍길동	275

남자	여자
162.2	155.2
182.1	176.9

[R] 기술통계 (0)	2020.07.08
[R] 데이터 개념 이해하기 (2-2) (0)	2020.07.08
[R] 필수 패키지 설치 및 외부 문서 읽어오기 (0)	2020.07.02
[R] 데이터 개념 이해하기(기초) (0)	2020.07.02
[R] Python 가상환경 생성 및 R 주피터 노트북 연결 (0)	2020.07.02

IT공부 일지

[R] 데이터 개념 이해하기(2)

데이터 개념 이해하기¶

벡터 만들기¶

data frame 만들기¶

외부 데이터 가져오기 예제¶

subset() : 조건으로 관측치(행) 선택¶

select : 변수 선택해서 취하거나 버리기¶

colnames() : 변수명 바꾸기¶

cbind() : 새로운 변수 추가¶

merge() : 2개의 데이터프레임 합치기¶

rbind() : 행으로 추가하기¶

list : 모든 종류의 데이터 객체 담기¶

class() : 자료형 확인¶

split()¶

mean() : 리스트 평균 구하기¶

sapply() : 리스트 항목별로 작업¶

명목형 변수 도수분포표 만들기¶

table() : 빈도수¶

prop.table() : 상대도수¶

rbind() : 함께보기¶

addmargins() : table 객체 합 구하기¶

연속형 변수 도수분포표 만들기¶

cut() : 구간 나누기¶

cumsum() : 누적상대도수¶

rownames() : 행 이름 추가/변경¶

예제 studentlist2 도수분포표로 만들기¶

분할표 만들기 : 두 변수의 빈도수를 나타내는 표¶

결측치 NA (값이 없는 경우) 처리¶

'R' 카테고리의 다른 글

댓글

티스토리툴바

name	sex	age	grade	absence	bloodtype	weight
김길동	남자	23	3	유	O	68.2
이미린	여자	22	2	무	AB	53.0
홍길동	남자	24	4	무	B	80.1
김철수	남자	23	3	무	AB	85.7
손세수	여자	20	1	유	A	49.5
박미희	여자	21	2	무	O	52.0
강수친	여자	22	1	무	O	45.3
이희수	여자	23	1	무	A	55.0
이철린	남자	23	3	무	B	64.2
방희철	남자	22	2	무	B	61.3
박수호	남자	24	4	유	O	62.0
임동민	남자	22	2	무	AB	75.8
김민수	남자	21	1	무	A	55.3
이희진	여자	23	3	무	O	53.1
김미진	여자	22	2	무	B	45.2
김동수	남자	24	4	유	B	70.2
여수근	남자	21	1	무	A	62.2

	A	AB	B	O
blood_type_table	4.0000000	3.0000000	5.0000000	5.0000000
blood_type_prop_table	0.2352941	0.1764706	0.2941176	0.2941176

	(155,162]	(162,169]	(169,175]	(175,182]
height_break_4_table	2.0000000	6.0000000	3.0000000	6.0000000
height_break_4_prop_table	0.1176471	0.3529412	0.1764706	0.3529412

	(155,162]	(162,169]	(169,175]	(175,182]
도수	2.0000000	6.0000000	3.0000000	6.0000000
상대도수	0.1176471	0.3529412	0.1764706	0.3529412
누적도수	0.1176471	0.4705882	0.6470588	1.0000000

	1	2	3	4
student_list2_table	16.00	37.00	31.00	16.00
student_list2_porp_table	0.16	0.37	0.31	0.16

	A	AB	B	O	Sum
남자	0.1176471	0.11764706	0.23529412	0.1176471	0.5882353
여자	0.1176471	0.05882353	0.05882353	0.1764706	0.4117647
Sum	0.2352941	0.17647059	0.29411765	0.2941176	1.0000000

a1	b1	c1
1	a	1.1
2	b	2.2
3	c	3.3
4	d	4.4
5	e	5.5

count	name	meanCount
1	a	1.1
2	b	2.2
3	c	3.3
4	d	4.4
5	e	5.5

[R] 데이터 개념 이해하기(2)

데이터 개념 이해하기¶

벡터 만들기¶

data frame 만들기¶

외부 데이터 가져오기 예제¶

subset() : 조건으로 관측치(행) 선택¶

select : 변수 선택해서 취하거나 버리기¶

colnames() : 변수명 바꾸기¶

cbind() : 새로운 변수 추가¶

merge() : 2개의 데이터프레임 합치기¶

rbind() : 행으로 추가하기¶

list : 모든 종류의 데이터 객체 담기¶

class() : 자료형 확인¶

split()¶

mean() : 리스트 평균 구하기¶

sapply() : 리스트 항목별로 작업¶

명목형 변수 도수분포표 만들기¶

table() : 빈도수¶

prop.table() : 상대도수¶

rbind() : 함께보기¶

addmargins() : table 객체 합 구하기¶

연속형 변수 도수분포표 만들기¶

cut() : 구간 나누기¶

cumsum() : 누적상대도수¶

rownames() : 행 이름 추가/변경¶

예제 studentlist2 도수분포표로 만들기¶

분할표 만들기 : 두 변수의 빈도수를 나타내는 표¶

결측치 NA (값이 없는 경우) 처리¶

'R' 카테고리의 다른 글

관련글

댓글

티스토리툴바