[R기초] 문자열 처리 패키지(stringr)

티스토리 뷰

카테고리 없음

[R기초] 문자열 처리 패키지(stringr)

훈데이텀 2019. 2. 21. 18:25

strings.pdf

Detect Matches

> library(data.table)

> library(stringr)

# 함수 실습을 위해 간단한 데이터테이블을 만든다.

> string <- data.table(member = c("mother", "father",

+ "grandmother", "grandfather",

+ "son 1", "daughter 1",

+ "son 2", "daughter 2","uncle"),

+ old = c(50, 55, 80, 84, 14, 20, 15,18,40))

> string

member old

1: mother 50

2: father 55

3: grandmother 80

4: grandfather 84

5: son 1 14

6: daughter 1 20

7: son 2 15

8: daughter 2 18

9: uncle 40

# str_detect(string, pattern) : 문자열(string)에 pattern이 포함되어 있는지를 진릿값(TRUE/FALSE)으로 알려준다.

> str_detect(string$member, "mo")

[1] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE

# str_which(string, pattern) : 문자열(string)에서 pattern을 비교하여 TRUE인 인덱스를 알려준다

> str_which(string$member, "mo")

[1] 1 3

# str_detect, str_which 응용

> mother_index1 <- str_detect(string$member, "mo")

> mother_index2 <- str_which(string$member, "mo")

> string[mother_index1]

member old

1: mother 50

2: grandmother 80

> string[mother_index2]

member old

1: mother 50

2: grandmother 80

# str_count(string, pattern) : string에서 pattern이 나타나는 개수를 세어 준다.

> str_count(string$member, "a")

[1] 0 1 1 2 0 1 0 1 0

# str_locate(string, pattern) : string에서 pattern의 위치를 알려준다.(행렬 반환)

> str_locate(string$member, "a")

start end

[1,] NA NA

[2,] 2 2

[3,] 3 3

[4,] 3 3

[5,] NA NA

[6,] 2 2

[7,] NA NA

[8,] 2 2

[9,] NA NA

# str_locate_all(string, pattern) : string에서 pattern의 위치를 알려준다.(리스트 반환)

> str_locate_all(string$member, "a")

[[1]]

start end

[[2]]

start end

[1,] 2 2

[[3]]

start end

[1,] 3 3

Subset Strings

#str_sub(string, start = , end = ) : 문자열(string)의 start 위치부터 end 위치까지의 부분 문자열을 추출한다.

> str_sub(string$member, start = 1, end = 3)

[1] "mot" "fat" "gra" "gra" "son" "dau" "son" "dau" "unc"

#str_subset(string, pattern) : 문자열(string)에서 pattern을 포함하는 것을 나타낸다.

> str_subset(string$member, "fa")

[1] "father" "grandfather"

#str_subset 응용

> father_subset <- str_subset(string$member, "fa")

> string[member %in% father_subset]

member old

1: father 55

2: grandfather 84

#str_extract(string, pattern) : 첫번째 pattern을 벡터 형식으로 나타낸다.

> str_extract(string$member, "[a-f]")

[1] "e" "f" "a" "a" NA "d" NA "d" "c"

#str_extract_all(string, pattern) : 모든 pattern을 리스트 형식으로 나타낸다.

> str_extract_all(string$member, "[a-f]")

[[1]]

[1] "e"

[[2]]

[1] "f" "a" "e"

#str_match(string, pattern) : 첫번째 pattern을 행렬 형식으로 나타낸다.

> str_match(string$member, "[a-f]")

[,1]

[1,] "e"

[2,] "f"

[3,] "a"

[4,] "a"

[5,] NA

[6,] "d"

[7,] NA

[8,] "d"

[9,] "c"

#str_match_all(string, pattern) : 모든 pattern을 리스트 형식으로 나타낸다.

> str_match_all(string$member, "[a-f]")

[[1]]

[,1]

[1,] "e"

[[2]]

[,1]

[1,] "f"

[2,] "a"

[3,] "e"

Manage Lengths

> str_length(string$member)

[1] 6 6 11 11 5 10 5 10 5

> str_pad(string$member, 15, side = "right")

[1] "mother         " "father         " "grandmother    " "grandfather    " "son 1          "

[6] "daughter 1     " "son 2          " "daughter 2     " "uncle          "

> str_pad(string$member, 15, side = "left")

[1] "         mother" "         father" "    grandmother" "    grandfather" "          son 1"

[6] "     daughter 1" "          son 2" "     daughter 2" "          uncle"

> str_pad(string$member, 15, side = "both")

[1] "    mother     " "    father     " "  grandmother  " "  grandfather  " "     son 1     "

[6] "  daughter 1   " "     son 2     " "  daughter 2   " "     uncle     "

> str_pad(string$member, 15, side = "both", pad = ".")

[1] "....mother....." "....father....." "..grandmother.." "..grandfather.." ".....son 1....."

[6] "..daughter 1..." ".....son 2....." "..daughter 2..." ".....uncle....."

> str_trunc(string$member, 5, side = "right")

[1] "mo..." "fa..." "gr..." "gr..." "son 1" "da..." "son 2" "da..." "uncle"

> str_trunc(string$member, 5, side = "left")

[1] "...er" "...er" "...er" "...er" "son 1" "... 1" "son 2" "... 2" "uncle"

> str_trunc(string$member, 5, side = "center")

[1] "m...r" "f...r" "g...r" "g...r" "son 1" "d...1" "son 2" "d...2" "uncle"

> str_trunc(string$member, 5, side = "center", ellipsis = ".")

[1] "mo.er" "fa.er" "gr.er" "gr.er" "son 1" "da. 1" "son 2" "da. 2" "uncle"

Mutate Strings

> string_sub <- string

> str_sub(string_sub$member, start = 1, end = 4) <- "xxxx"

> string_sub

member old

1: xxxxer 50

2: xxxxer 55

3: xxxxdmother 80

4: xxxxdfather 84

5: xxxx1 14

6: xxxxhter 1 20

7: xxxx2 15

8: xxxxhter 2 18

9: xxxxe 40

> string_sub <- string

> str_replace(string_sub$member, "a", "-")

[1] "mother" "f-ther" "gr-ndmother" "gr-ndfather" "son 1" "d-ughter 1" "son 2"

[8] "d-ughter 2" "uncle"

> string_sub <- string

> str_replace_all(string_sub$member, "a", "-")

[1] "mother" "f-ther" "gr-ndmother" "gr-ndf-ther" "son 1" "d-ughter 1" "son 2"

[8] "d-ughter 2" "uncle"

> string_sub <- string

> str_to_upper(string_sub$member, locale = "en")

[1] "MOTHER" "FATHER" "GRANDMOTHER" "GRANDFATHER" "SON 1" "DAUGHTER 1" "SON 2"

[8] "DAUGHTER 2" "UNCLE"

> str_to_lower(string_sub$member, locale = "en")

[1] "mother" "father" "grandmother" "grandfather" "son 1" "daughter 1" "son 2"

[8] "daughter 2" "uncle"

> str_to_title(string_sub$member, locale = "en")

[1] "Mother" "Father" "Grandmother" "Grandfather" "Son 1" "Daughter 1" "Son 2"

[8] "Daughter 2" "Uncle"

> str_c(string_sub$member, collapse = "")

[1] "motherfathergrandmothergrandfatherson 1daughter 1son 2daughter 2uncle"

> string_sub

member old

1: mother 50

2: father 55

3: grandmother 80

4: grandfather 84

5: son 1 14

6: daughter 1 20

7: son 2 15

8: daughter 2 18

9: uncle 40

> str_dup(string_sub$member,times = 2)

[1] "mothermother" "fatherfather" "grandmothergrandmother" "grandfathergrandfather"

[5] "son 1son 1" "daughter 1daughter 1" "son 2son 2" "daughter 2daughter 2"

[9] "uncleuncle"

> str_split_fixed(string_sub$member, " ", n = 2) #matrix 형태

[,1] [,2]

[1,] "mother" ""

[2,] "father" ""

[3,] "grandmother" ""

[4,] "grandfather" ""

[5,] "son" "1"

[6,] "daughter" "1"

[7,] "son" "2"

[8,] "daughter" "2"

[9,] "uncle" ""

> str_split(string_sub$member, " ", n = 2) #list 형태

[[1]]

[1] "mother"

[[2]]

[1] "father"

[[3]]

[1] "grandmother"

[[4]]

[1] "grandfather"

[[5]]

[1] "son" "1"

[[6]]

[1] "daughter" "1"

[[7]]

[1] "son" "2"

[[8]]

[1] "daughter" "2"

[[9]]

[1] "uncle"

> str_order(string_sub$member, decreasing = FALSE)

[1] 6 8 2 4 3 1 5 7 9

> str_order(string_sub$member, decreasing = TRUE)

[1] 9 7 5 1 3 4 2 8 6

> member_order <- str_order(string_sub$member, decreasing = FALSE)

> string_sub$member[member_order]

[1] "daughter 1" "daughter 2" "father" "grandfather" "grandmother" "mother" "son 1"

[8] "son 2" "uncle"

공지사항

최근에 올라온 글

최근에 달린 댓글

Total

Today

Yesterday

링크

꿈은 없구요 놀고 싶습니다.

TAG more

« 2025/04 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30

글 보관함

데이터세상

티스토리 뷰

[R기초] 문자열 처리 패키지(stringr)

티스토리툴바