15 - Regex

Psicometria per le Neuroscienze Cognitive
GPT summaries of R4DS Books

Filippo Gambarota, PhD

Data and packages

library(tidyverse)
library(babynames)

Data and packages

The examples use three character vectors that ship with stringr:

  • fruit contains the names of 80 fruits.
  • words contains 980 common English words.
  • sentences contains 720 short sentences.

Pattern basics

str_view(fruit, "berry")
 [6] │ bil<berry>
 [7] │ black<berry>
[10] │ blue<berry>
[11] │ boysen<berry>
[19] │ cloud<berry>
[21] │ cran<berry>
[29] │ elder<berry>
[32] │ goji <berry>
[33] │ goose<berry>
[38] │ huckle<berry>
[50] │ mul<berry>
[70] │ rasp<berry>
[73] │ salal <berry>
[76] │ straw<berry>
str_view(c("a", "ab", "ae", "bd", "ea", "eab"), "a.")
[2] │ <ab>
[3] │ <ae>
[6] │ e<ab>
str_view(fruit, "a...e")
 [1] │ <apple>
 [7] │ bl<ackbe>rry
[48] │ mand<arine>
[51] │ nect<arine>
[62] │ pine<apple>
[64] │ pomegr<anate>
[70] │ r<aspbe>rry
[73] │ sal<al be>rry

Pattern basics

Quantifiers control how many times a pattern can match:

  • ? makes a pattern optional (i.e. it matches 0 or 1 times)
  • + lets a pattern repeat (i.e. it matches at least once)
  • * lets a pattern be optional or repeat (i.e. it matches any number of times, including 0).
# ab? matches an "a", optionally followed by a "b".
str_view(c("a", "ab", "abb"), "ab?")
[1] │ <a>
[2] │ <ab>
[3] │ <ab>b
# ab+ matches an "a", followed by at least one "b".
str_view(c("a", "ab", "abb"), "ab+")
[2] │ <ab>
[3] │ <abb>
# ab* matches an "a", followed by any number of "b"s.
str_view(c("a", "ab", "abb"), "ab*")
[1] │ <a>
[2] │ <ab>
[3] │ <abb>

Pattern basics

str_view(words, "[aeiou]x[aeiou]")
[284] │ <exa>ct
[285] │ <exa>mple
[288] │ <exe>rcise
[289] │ <exi>st
str_view(words, "[^aeiou]y[^aeiou]")
[836] │ <sys>tem
[901] │ <typ>e

Pattern basics

str_view(fruit, "apple|melon|nut")
 [1] │ <apple>
[13] │ canary <melon>
[20] │ coco<nut>
[52] │ <nut>
[62] │ pine<apple>
[72] │ rock <melon>
[80] │ water<melon>
str_view(fruit, "aa|ee|ii|oo|uu")
 [9] │ bl<oo>d orange
[33] │ g<oo>seberry
[47] │ lych<ee>
[66] │ purple mangost<ee>n

Detect matches

str_detect(c("a", "b", "c"), "[aeiou]")
[1]  TRUE FALSE FALSE

Detect matches

babynames |> 
  filter(str_detect(name, "x")) |> 
  count(name, wt = n, sort = TRUE)
# A tibble: 974 × 2
   name            n
   <chr>       <int>
 1 Alexander  665492
 2 Alexis     399551
 3 Alex       278705
 4 Alexandra  232223
 5 Max        148787
 6 Alexa      123032
 7 Maxine     112261
 8 Alexandria  97679
 9 Maxwell     90486
10 Jaxon       71234
# ℹ 964 more rows

Detect matches

babynames |> 
  group_by(year) |> 
  summarize(prop_x = mean(str_detect(name, "x")))
# A tibble: 138 × 2
    year  prop_x
   <dbl>   <dbl>
 1  1880 0.0065 
 2  1881 0.00879
 3  1882 0.00940
 4  1883 0.00768
 5  1884 0.00827
 6  1885 0.00872
 7  1886 0.00878
 8  1887 0.00801
 9  1888 0.00905
10  1889 0.00888
# ℹ 128 more rows

Count matches

x <- c("apple", "banana", "pear")
str_count(x, "p")
[1] 2 0 1

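Count matches

Matches never overlap: each match starts after the end of the previous one, so "abababa" contains only two matches of "aba", not three.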

str_count("abababa", "aba")
[1] 2
str_view("abababa", "aba")
[1] │ <aba>b<aba>

Count matches

babynames |> 
  count(name) |> 
  mutate(
    vowels = str_count(name, "[aeiou]"),
    consonants = str_count(name, "[^aeiou]")
  )
# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 Aaban        10      2          3
 2 Aabha         5      2          3
 3 Aabid         2      2          3
 4 Aabir         1      2          3
 5 Aabriella     5      4          5
 6 Aada          1      2          2
 7 Aadam        26      2          3
 8 Aadan        11      2          3
 9 Aadarsh      17      2          5
10 Aaden        18      2          3
# ℹ 97,300 more rows

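Count matches

Regular expressions are case sensitive, so in the previous table the capital "A" at the start of each name was counted as a consonant. Converting the names to lower case first gives the correct counts.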

babynames |> 
  count(name) |> 
  mutate(
    name = str_to_lower(name),
    vowels = str_count(name, "[aeiou]"),
    consonants = str_count(name, "[^aeiou]")
  )
# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 aaban        10      3          2
 2 aabha         5      3          2
 3 aabid         2      3          2
 4 aabir         1      3          2
 5 aabriella     5      5          4
 6 aada          1      3          1
 7 aadam        26      3          2
 8 aadan        11      3          2
 9 aadarsh      17      3          4
10 aaden        18      3          2
# ℹ 97,300 more rows

Replace values

x <- c("apple", "pear", "banana")
str_replace_all(x, "[aeiou]", "-")
[1] "-ppl-"  "p--r"   "b-n-n-"

Replace values

x <- c("apple", "pear", "banana")
str_remove_all(x, "[aeiou]")
[1] "ppl" "pr"  "bnn"

Anchors

str_view(fruit, "^a")
[1] │ <a>pple
[2] │ <a>pricot
[3] │ <a>vocado
str_view(fruit, "a$")
 [4] │ banan<a>
[15] │ cherimoy<a>
[30] │ feijo<a>
[36] │ guav<a>
[56] │ papay<a>
[74] │ satsum<a>

Anchors

str_view(fruit, "apple")
 [1] │ <apple>
[62] │ pine<apple>
str_view(fruit, "^apple$")
[1] │ <apple>