LexFindR vignette

ZhaoBin Li, Anne Marie Crinnion, and James S. Magnuson

2024-06-16

library(LexFindR)

Overview

The LexFindR package implements R code to get various competitor types studied in psycholinguistics, including cohorts (get_cohorts), rhymes (get_rhymes), neighbors (get_neighbors), and words that embed within a target word (get_embeds_in_target) and words a target word embeds into (get_target_embeds_in).

The code uses regular expressions and balances speed and readability. By default, it is designed to handle complete pronunciation transcriptions (e.g., ARPAbet), in which each phoneme is coded as one or more ASCII characters and phonemes are separated by spaces. However, you can also use forms without delimiters by passing the sep = "" argument where appropriate. As shown in this vignette, alternative transcriptions can easily be converted to the expected format.
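
For example, assuming single-character phoneme transcriptions with no delimiter, passing sep = "" should give the same cohorts as the space-delimited examples below:

# undelimited one-character-per-phoneme forms
get_cohorts("ark", c("ark", "art", "bab"), sep = "", form = TRUE)
#> [1] "ark" "art"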

Installation

# Install LexFindR from CRAN
install.packages("LexFindR")

# Or the development version from GitHub:
# install.packages("devtools")
devtools::install_github("maglab-uconn/LexFindR")

Getting started

library(LexFindR)

# Get the cohort indices of ark in a lexicon of ark, art, and bab
target <- "AA R K"
lexicon <- c("AA R K", "AA R T", "B AA B")

cohort <- get_cohorts(target, lexicon)
cohort
#> [1] 1 2

# To get forms rather than indices using base R
lexicon[cohort]
#> [1] "AA R K" "AA R T"

# To get forms rather than indices using the form option
get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA R K" "AA R T"

# Get count using base R
length(cohort)
#> [1] 2

# Get count using the count option
get_cohorts(target, lexicon, count = TRUE)
#> [1] 2

# Frequency weighting
target_freq <- 50
lexicon_freq <- c(50, 274, 45)

# get the summed log frequencies of competitors
get_fw(lexicon_freq)
#> [1] 13.33181

# get the frequency-weighted competitor probability (FWCP)
get_fwcp(target_freq, lexicon_freq)
#> [1] 0.2934352
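
Under the hood, get_fw sums the natural-log frequencies of the competitor set, and get_fwcp divides the log target frequency by that sum (here the competitor set already includes the target). A minimal base-R sketch that reproduces the values above:

# FW: summed natural-log frequencies of the competitors
sum(log(lexicon_freq))
#> [1] 13.33181

# FWCP: log target frequency over the summed log frequencies
log(target_freq) / sum(log(lexicon_freq))
#> [1] 0.2934352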

NOTE: On using ARPABET from the CMU Pronouncing Dictionary

# By default, CMU has numbers that indicate stress patterns
# 
# If you do not strip those out, instances of the same vowel
# with different stress numbers will be treated as different
# symbols. This may be useful for some purposes (e.g., finding
# cohorts or neighbors with the same stress pattern).
# 
# Here is a contrived example, where ARK will not be considered
# related to ART or BARK because of stress pattern differences
target <- "AA0 R K"
lexicon <- c("AA0 R K", "AA2 R T", "B AA1 R K")

get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA0 R K"
get_neighbors(target, lexicon, form = TRUE)
#> [1] "AA0 R K"

# If this is not the behavior we want, we can strip lexical 
# stress indicators using regular expressions
target <- gsub("\\d", "", target)
lexicon <- gsub("\\d", "", lexicon)

print(target)
#> [1] "AA R K"
print(lexicon)
#> [1] "AA R K"   "AA R T"   "B AA R K"

get_cohorts(target, lexicon, form = TRUE)
#> [1] "AA R K" "AA R T"
get_neighbors(target, lexicon, form = TRUE)
#> [1] "AA R K"   "AA R T"   "B AA R K"
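
The same substitution scales to a whole lexicon; a minimal sketch, assuming a hypothetical cmu_df dataframe with a Pronunciation column:

# hypothetical CMU-style lexicon
cmu_df <- data.frame(
  Item = c("ark", "art", "bark"),
  Pronunciation = c("AA0 R K", "AA1 R T", "B AA1 R K")
)

# strip the stress digits from every pronunciation
cmu_df$Pronunciation <- gsub("\\d", "", cmu_df$Pronunciation)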

EXAMPLE: cohorts with the TRACE slex lexicon

This example shows how to compute cohorts for every word in a lexicon at once, with each step spelled out.

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
glimpse(slex)
#> Rows: 212
#> Columns: 3
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…

# define the dataframe with the list of target words to compute
# cohorts for; we will use *target_df* instead of modifying
# slex or lemmalex directly
target_df <- slex

# specify the reference lexicon; here it is actually the list
# of pronunciations from slex, as we want to find all cohorts
# for all words in our lexicon. It is not necessary to create
# a new dataframe, but because we find it useful for more
# complex tasks, we use this approach here
lexicon_df <- target_df

# this instruction will create a new column in our target_df
# dataframe, "cohort_idx", which will be the list of lexicon_df
# indices corresponding to each word's cohort set
target_df$cohort_idx <-
  lapply(
    # in each lapply instance, select the target pronunciation
    target_df$Pronunciation,
    # in each lapply instance, apply the get_cohorts function
    FUN = get_cohorts,
    # in each lapply instance, compare the current target 
    # Pronunciation to each lexicon Pronunciation
    lexicon = lexicon_df$Pronunciation
  )

# let's look at the first few instances in each field...
glimpse(target_df)
#> Rows: 212
#> Columns: 4
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ cohort_idx    <list> 1, <2, 3, 4, 5>, <2, 3, 4, 5>, <2, 3, 4, 5>, <2, 3, 4, …
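
As in the Getting started section, the index lists can be turned into word labels and counts; a base-R sketch (the extended example below does this for every competitor type):

# map each word's cohort indices to Item labels and a count
target_df$cohort_str <- lapply(
  target_df$cohort_idx,
  function(idx) lexicon_df$Item[idx]
)
target_df$cohort_num <- lengths(target_df$cohort_idx)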

EXAMPLE: rhymes with the TRACE slex lexicon and tidyverse piping style

The tidyverse piping style makes multi-step computations like this more readable.

slex_rhymes <- slex %>% mutate(
  rhyme_idx = lapply(Pronunciation, get_rhymes, lexicon = Pronunciation),
  rhyme_str = lapply(rhyme_idx, function(idx) {
    Item[idx]
  }),
  rhyme_count = lengths(rhyme_idx)
)

glimpse(slex_rhymes)
#> Rows: 212
#> Columns: 6
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ rhyme_idx     <list> <1, 44, 130>, <2, 10, 69, 104, 184>, <3, 11, 29, 106>, …
#> $ rhyme_str     <list> <"ad", "gad", "rad">, <"ar", "bar", "kar", "par", "tar"…
#> $ rhyme_count   <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3, 3, 1, 2,…

slex_rhymes <- slex_rhymes %>%
  rowwise() %>%
  mutate(
    rhyme_freq = list(slex$Frequency[rhyme_idx]),
    rhyme_fw = get_fw(rhyme_freq),
    rhyme_fwcp = get_fwcp(Frequency, rhyme_freq)
  ) %>% 
  ungroup()

glimpse(slex_rhymes)
#> Rows: 212
#> Columns: 9
#> $ Item          <chr> "ad", "ar", "ark", "art", "art^st", "bab", "babi", "badi…
#> $ Pronunciation <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH S T", "B …
#> $ Frequency     <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 125, 95, 6…
#> $ rhyme_idx     <list> <1, 44, 130>, <2, 10, 69, 104, 184>, <3, 11, 29, 106>, …
#> $ rhyme_str     <list> <"ad", "gad", "rad">, <"ar", "bar", "kar", "par", "tar"…
#> $ rhyme_count   <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3, 3, 1, 2,…
#> $ rhyme_freq    <list> <53, 332, 29>, <4406, 125, 386, 10, 20>, <50, 125, 234,…
#> $ rhyme_fw      <dbl> 13.142723, 24.473191, 19.684596, 15.046612, 4.718499, 7.…
#> $ rhyme_fwcp    <dbl> 0.3020905, 0.3428536, 0.1987352, 0.3730493, 1.0000000, 0…
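
The rowwise() pass above can also be written with purrr's map functions (loaded with the tidyverse); a sketch, not benchmarked here:

slex_rhymes_alt <- slex_rhymes %>%
  mutate(
    rhyme_fw = map_dbl(rhyme_freq, get_fw),
    rhyme_fwcp = map2_dbl(Frequency, rhyme_freq, get_fwcp)
  )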

EXAMPLE: Using parallelization

library(future.apply)
library(tictoc)

# using two cores for the demo; set `workers` to
# availableCores() to use all cores
plan(multisession, workers = 2)

glimpse(lemmalex)
#> Rows: 17,750
#> Columns: 3
#> $ Item          <chr> "a", "abandon", "abandonment", "abate", "abbey", "abbot"…
#> $ Frequency     <dbl> 20415.27, 8.10, 0.96, 0.10, 3.18, 0.84, 0.02, 0.24, 3.35…
#> $ Pronunciation <chr> "AH", "AH B AE N D IH N", "AH B AE N D AH N M AH N T", "…


# on a 15-inch MacBook Pro with a 6-core i9, the portion between
# tic and toc below takes ~66 secs with future_lapply; if you
# replace future_lapply with lapply, it takes ~317 secs

tic("Finding rhymes")

slex_rhyme_lemmalex <- lemmalex %>% mutate(
  rhyme = future_lapply(Pronunciation, get_rhymes, 
                            lexicon = lemmalex$Pronunciation),
  rhyme_str = lapply(rhyme, function(idx) {
    lemmalex$Item[idx]
  }),
  rhyme_len = lengths(rhyme)
)

toc()
#> Finding rhymes: 37.793 sec elapsed

glimpse(slex_rhyme_lemmalex)
#> Rows: 17,750
#> Columns: 6
#> $ Item          <chr> "a", "abandon", "abandonment", "abate", "abbey", "abbot"…
#> $ Frequency     <dbl> 20415.27, 8.10, 0.96, 0.10, 3.18, 0.84, 0.02, 0.24, 3.35…
#> $ Pronunciation <chr> "AH", "AH B AE N D IH N", "AH B AE N D AH N M AH N T", "…
#> $ rhyme         <list> <1, 8846, 15769>, 2, 3, <4, 1136>, <5, 1092, 1285, 1331…
#> $ rhyme_str     <list> <"a", "le", "the">, "abandon", "abandonment", <"abate",…
#> $ rhyme_len     <int> 3, 1, 1, 2, 5, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1,…
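
When the parallel work is done, you can release the worker processes and return to sequential evaluation (standard future usage):

plan(sequential)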

EXAMPLE: Putting it all together

This extended example is from the paper describing LexFindR (Li, Crinnion, & Magnuson, 2021, Behavior Research Methods).

library(LexFindR)
library(tidyverse) # for glimpse
library(future.apply) # parallelization
library(tictoc) # timing utilities

# In this example, we define a dataframe source for target words
# (target_df) and another for the lexicon to compare the target
# words to (lexicon_df). Often, these will be the same, but we keep
# them separate here to make it easier for others to generalize from
# this example code.

# Code assumes you have at least 3 columns in target_df & lexicon_df:
# 1. Item -- a label of some sort, can be identical to Pronunciation
# 2. Pronunciation -- typically a phonological form
# 3. Frequency -- should be in occurrences per million, or some other
#                 raw form, as the functions below take the log of
#                 the raw frequencies. See the advice about padding
#                 in the main article text.
#
# Of course, you can name your fields as you like, and edit the
# field names below appropriately.
target_df <- slex
lexicon_df <- target_df

# Prepare for parallelizing
# 1. how many cores do we have?
# num_cores <- availableCores()

# using two cores for demo
num_cores <- 2

print(paste0("Using num_cores: ", num_cores))
#> [1] "Using num_cores: 2"
# 2. now let future.apply figure out how to optimize parallel
#    division of labor over cores
plan(multisession, workers = num_cores)

# the functions in this list all return lists of word indices; the
# uniqueness point function is not included because it returns a
# single value per word (a sketch for it follows this list)
fun_list <- c(
  "cohorts", "neighbors",
  "rhymes", "homoforms",
  "target_embeds_in", "embeds_in_target",
  "nohorts", "cohortsP", "neighborsP",
  "target_embeds_inP", "embeds_in_targetP"
)
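
For completeness, the uniqueness point can be computed in a separate pass, since get_uniqpt returns one value per word rather than an index list. A sketch, assuming the same target_df and lexicon_df as above:

# uniqueness point: the position at which the target diverges
# from every other word in the lexicon
target_df$uniqpt <- sapply(
  target_df$Pronunciation,
  FUN = get_uniqpt,
  lexicon = lexicon_df$Pronunciation
)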

# we need to keep track of the P variants, as we need to tell get_fwcp
# to add in the target frequency for these, as they exclude the target
Ps <- c(
  "cohortsP", "neighborsP", "target_embeds_inP",
  "embeds_in_targetP"
)

# determine how much to pad based on minimum frequency
if (min(target_df$Frequency) == 0) {
  pad <- 2
} else if (min(target_df$Frequency) < 1) {
  pad <- 1
} else {
  pad <- 0
}
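
The padding matters because log(0) is -Inf and the log of a frequency below 1 is negative; on our reading, the pad argument is added to each raw frequency before the log is taken, so these choices keep the summed log frequencies well-defined and non-negative:

log(0)       # -Inf without padding
#> [1] -Inf
log(0 + 2)   # positive once pad = 2 is added
#> [1] 0.6931472
log(0.5 + 1) # positive once pad = 1 is added
#> [1] 0.4054651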

# now let's loop through the functions
for (fun_name in fun_list) {
  # start timer for this function
  tic(fun_name)

  # the P functions do not include the target in the denominator for
  # get_fwcp; if we want this to be a consistent ratio, we need to
  # add target frequency to the denominator
  add_target <- FALSE
  if (fun_name %in% Ps) {
    add_target <- TRUE
  }

  # inform the user that we are starting the next function and
  # whether we are adding the target frequency
  cat("Starting", fun_name, " -- add_target = ", add_target, "\n")
  func <- paste0("get_", fun_name)

  # use *future_lapply* to do the competitor search, creating
  # a new column in *target_df* that will be this function's
  # name + _idx (e.g., cohort_idx)
  target_df[[paste0(fun_name, "_idx")]] <-
    future_lapply(target_df$Pronunciation,
      FUN = get(func),
      lexicon = lexicon_df$Pronunciation
    )

  # list the competitor form labels in functionname_str
  target_df[[paste0(fun_name, "_str")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Item[idx]
    }
  )

  # list the competitor frequencies in functionname_freq
  target_df[[paste0(fun_name, "_freq")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Frequency[idx]
    }
  )

  # put the count of competitors in functionname_num
  target_df[[paste0(fun_name, "_num")]] <-
    lengths(target_df[[paste0(fun_name, "_idx")]])

  # put the FW in functionname_fwt
  target_df[[paste0(fun_name, "_fwt")]] <-
    mapply(get_fw,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad
    )

  # put the FWCP in functionname_fwcp
  target_df[[paste0(fun_name, "_fwcp")]] <-
    mapply(get_fwcp,
      target_freq = target_df$Frequency,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad, add_target = add_target
    )

  toc()
}
#> Starting cohorts  -- add_target =  FALSE
#> cohorts: 0.084 sec elapsed
#> Starting neighbors  -- add_target =  FALSE
#> neighbors: 0.067 sec elapsed
#> Starting rhymes  -- add_target =  FALSE
#> rhymes: 0.043 sec elapsed
#> Starting homoforms  -- add_target =  FALSE
#> homoforms: 0.035 sec elapsed
#> Starting target_embeds_in  -- add_target =  FALSE
#> target_embeds_in: 0.039 sec elapsed
#> Starting embeds_in_target  -- add_target =  FALSE
#> embeds_in_target: 0.042 sec elapsed
#> Starting nohorts  -- add_target =  FALSE
#> nohorts: 0.059 sec elapsed
#> Starting cohortsP  -- add_target =  TRUE
#> cohortsP: 0.06 sec elapsed
#> Starting neighborsP  -- add_target =  TRUE
#> neighborsP: 0.073 sec elapsed
#> Starting target_embeds_inP  -- add_target =  TRUE
#> target_embeds_inP: 0.099 sec elapsed
#> Starting embeds_in_targetP  -- add_target =  TRUE
#> embeds_in_targetP: 0.079 sec elapsed

# Note that get_neighborsP excludes rhymes. If you do not want to 
# track rhymes separately and want neighborsP to include all 
# rhymes that are not cohorts, you can create new fields that 
# combine them, as we do here, creating "Pr" versions
target_df$neighborsPr_num = target_df$neighborsP_num + target_df$rhymes_num
target_df$neighborsPr_fwcp = target_df$neighborsP_fwcp + target_df$rhymes_fwcp
target_df$neighborsPr_fwt = target_df$neighborsP_fwt + target_df$rhymes_fwt

# Now let's streamline the dataframe; we'll select the num, fwt, and fwcp
# columns and put them in that order, while not keeping some of the other
# 'helper' columns we created

export_df <- target_df %>% 
  select(Item | Pronunciation | Frequency 
         | ends_with("_num") | ends_with("_fwt") | ends_with("_fwcp"))

glimpse(export_df)
#> Rows: 212
#> Columns: 39
#> $ Item                   <chr> "ad", "ar", "ark", "art", "art^st", "bab", "bab…
#> $ Pronunciation          <chr> "AA D", "AA R", "AA R K", "AA R T", "AA R T AH …
#> $ Frequency              <int> 53, 4406, 50, 274, 112, 45, 23, 341, 87, 125, 1…
#> $ cohorts_num            <int> 1, 4, 4, 4, 4, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3,…
#> $ neighbors_num          <int> 4, 8, 6, 5, 1, 4, 4, 2, 1, 7, 5, 1, 7, 5, 8, 3,…
#> $ rhymes_num             <int> 3, 5, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 3, 4, 3,…
#> $ homoforms_num          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ target_embeds_in_num   <int> 6, 29, 5, 9, 1, 2, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1…
#> $ embeds_in_target_num   <int> 1, 1, 2, 2, 5, 1, 3, 2, 1, 2, 4, 2, 1, 3, 3, 2,…
#> $ nohorts_num            <int> 1, 3, 3, 3, 1, 3, 3, 2, 1, 3, 2, 1, 2, 2, 3, 1,…
#> $ cohortsP_num           <int> 0, 1, 1, 1, 3, 4, 4, 5, 6, 4, 5, 6, 1, 1, 0, 2,…
#> $ neighborsP_num         <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0,…
#> $ target_embeds_inP_num  <int> 3, 21, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0…
#> $ embeds_in_targetP_num  <int> 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
#> $ neighborsPr_num        <int> 4, 6, 4, 3, 1, 2, 2, 1, 1, 5, 4, 1, 6, 4, 6, 3,…
#> $ cohorts_fwt            <dbl> 3.970292, 22.634373, 22.634373, 22.634373, 22.6…
#> $ neighbors_fwt          <dbl> 21.533445, 37.968634, 33.688446, 27.349358, 4.7…
#> $ rhymes_fwt             <dbl> 13.142723, 24.473191, 19.684596, 15.046612, 4.7…
#> $ homoforms_fwt          <dbl> 3.970292, 8.390723, 3.912023, 5.613128, 4.71849…
#> $ target_embeds_in_fwt   <dbl> 29.792782, 127.685319, 22.680328, 42.517044, 4.…
#> $ embeds_in_target_fwt   <dbl> 3.970292, 8.390723, 12.302746, 14.003851, 35.28…
#> $ nohorts_fwt            <dbl> 3.970292, 17.915874, 17.915874, 17.915874, 4.71…
#> $ cohortsP_fwt           <dbl> 0.000000, 4.718499, 4.718499, 4.718499, 17.9158…
#> $ neighborsP_fwt         <dbl> 8.390723, 3.970292, 0.000000, 0.000000, 0.00000…
#> $ target_embeds_inP_fwt  <dbl> 16.650059, 88.968478, 2.995732, 22.751933, 0.00…
#> $ embeds_in_targetP_fwt  <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 16.5642…
#> $ neighborsPr_fwt        <dbl> 21.533445, 28.443483, 19.684596, 15.046612, 4.7…
#> $ cohorts_fwcp           <dbl> 1.00000000, 0.37070710, 0.17283550, 0.24799133,…
#> $ neighbors_fwcp         <dbl> 0.1843779, 0.2209909, 0.1161236, 0.2052380, 1.0…
#> $ rhymes_fwcp            <dbl> 0.3020905, 0.3428536, 0.1987352, 0.3730493, 1.0…
#> $ homoforms_fwcp         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ target_embeds_in_fwcp  <dbl> 0.13326355, 0.06571407, 0.17248529, 0.13202066,…
#> $ embeds_in_target_fwcp  <dbl> 1.0000000, 1.0000000, 0.3179797, 0.4008275, 0.1…
#> $ nohorts_fwcp           <dbl> 1.0000000, 0.4683401, 0.2183551, 0.3133047, 1.0…
#> $ cohortsP_fwcp          <dbl> 1.0000000, 0.6400626, 0.4532777, 0.5432957, 0.2…
#> $ neighborsP_fwcp        <dbl> 0.3211947, 0.6788053, 1.0000000, 1.0000000, 1.0…
#> $ target_embeds_inP_fwcp <dbl> 0.19254240, 0.08618315, 0.56632333, 0.19788881,…
#> $ embeds_in_targetP_fwcp <dbl> 1.0000000, 1.0000000, 1.0000000, 1.0000000, 0.2…
#> $ neighborsPr_fwcp       <dbl> 0.6232852, 1.0216590, 1.1987352, 1.3730493, 2.0…
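
Finally, export_df contains only atomic columns, so it can be written straight to disk; a sketch using readr's write_csv (the filename is just an example):

write_csv(export_df, "slex_competitors.csv")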