Title: | Accessing the Wordbank Database |
---|---|
Description: | Connecting to Wordbank, an open repository for developmental vocabulary data. For more information on the underlying data, see <http://wordbank.stanford.edu>. |
Authors: | Mika Braginsky [aut, cre], Daniel Yurovsky [ctb], Michael Frank [ctb], Danielle Kellier [ctb], Alvin Tan [ctb] |
Maintainer: | Mika Braginsky <[email protected]> |
License: | GPL-3 |
Version: | 1.0.3.9000 |
Built: | 2024-11-19 03:10:17 UTC |
Source: | https://github.com/langcog/wordbankr |
Connect to the Wordbank database
connect_to_wordbank(db_args = NULL)
connect_to_wordbank(db_args = NULL)
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A `src` object which is a connection to the Wordbank database.
src <- connect_to_wordbank()
src <- connect_to_wordbank()
For each item in the input data, estimate its age of acquisition as the earliest age (in months) at which the proportion of children who understand/produce the item is greater than some threshold. The proportions used can be empirical or first smoothed by a model.
fit_aoa( instrument_data, measure = "produces", method = "glm", proportion = 0.5, age_min = min(instrument_data$age, na.rm = TRUE), age_max = max(instrument_data$age, na.rm = TRUE) )
fit_aoa( instrument_data, measure = "produces", method = "glm", proportion = 0.5, age_min = min(instrument_data$age, na.rm = TRUE), age_max = max(instrument_data$age, na.rm = TRUE) )
instrument_data |
A data frame returned by `get_instrument_data()`. |
measure |
One of "produces" or "understands" (defaults to "produces"). |
method |
A string indicating which smoothing method to use:
|
proportion |
A number between 0 and 1 indicating threshold proportion of children. |
age_min |
The minimum age to allow for an age of acquisition. Defaults to the minimum age in `instrument_data`. |
age_max |
The maximum age to allow for an age of acquisition. Defaults to the maximum age in `instrument_data`. |
A data frame where every row is an item, the item-level columns from the input data are preserved, and the `aoa` column contains the age of acquisition estimates.
eng_ws_data <- get_instrument_data(language = "English (American)", form = "WS", items = c("item_1", "item_42"), administration_info = TRUE) if (!is.null(eng_ws_data)) eng_ws_aoa <- fit_aoa(eng_ws_data)
eng_ws_data <- get_instrument_data(language = "English (American)", form = "WS", items = c("item_1", "item_42"), administration_info = TRUE) if (!is.null(eng_ws_data)) eng_ws_aoa <- fit_aoa(eng_ws_data)
Fit quantiles to vocabulary sizes using quantile regression
fit_vocab_quantiles(vocab_data, measure, group, quantiles = "standard")
fit_vocab_quantiles(vocab_data, measure, group, quantiles = "standard")
vocab_data |
A data frame returned by `get_administration_data()`. |
measure |
A column of `vocab_data` with the vocabulary values to fit quantiles to (e.g. `production`). |
group |
(Optional) A column of `vocab_data` to group by (e.g. `sex`). |
quantiles |
Either one of "standard" (default), "deciles", "quintiles", "quartiles", "median", or a numeric vector of quantile values. |
A data frame with the columns "language", "form", "age", `group` (if specified), "quantile", and `measure`, where `measure` is the fitted vocabulary value for that quantile at that age.
eng_wg <- get_administration_data(language = "English (American)", form = "WG", include_demographic_info = TRUE) if (!is.null(eng_wg)) { vocab_quantiles <- fit_vocab_quantiles(eng_wg, production) vocab_quantiles_sex <- fit_vocab_quantiles(eng_wg, production, sex) vocab_quartiles <- fit_vocab_quantiles(eng_wg, production, quantiles = "quartiles") }
eng_wg <- get_administration_data(language = "English (American)", form = "WG", include_demographic_info = TRUE) if (!is.null(eng_wg)) { vocab_quantiles <- fit_vocab_quantiles(eng_wg, production) vocab_quantiles_sex <- fit_vocab_quantiles(eng_wg, production, sex) vocab_quartiles <- fit_vocab_quantiles(eng_wg, production, quantiles = "quartiles") }
Get the Wordbank by-administration data
get_administration_data( language = NULL, form = NULL, filter_age = TRUE, include_demographic_info = FALSE, include_birth_info = FALSE, include_health_conditions = FALSE, include_language_exposure = FALSE, include_study_internal_id = FALSE, db_args = NULL )
get_administration_data( language = NULL, form = NULL, filter_age = TRUE, include_demographic_info = FALSE, include_birth_info = FALSE, include_health_conditions = FALSE, include_language_exposure = FALSE, include_study_internal_id = FALSE, db_args = NULL )
language |
An optional string specifying which language's administrations to retrieve. |
form |
An optional string specifying which form's administrations to retrieve. |
filter_age |
A logical indicating whether to filter the administrations to ones in the valid age range for their instrument. |
include_demographic_info |
A logical indicating whether to include the
child's demographic information ( |
include_birth_info |
A logical indicating whether to include the child's
birth information ( |
include_health_conditions |
A logical indicating whether to include the
child's health condition information (a nested dataframe under
|
include_language_exposure |
A logical indicating whether to include the
child's language exposure information at time of administration (a nested
dataframe under |
include_study_internal_id |
A logical indicating whether to include the child's ID in the original study data. |
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A data frame where each row is a CDI administration and each column is a variable about the administration (`data_id`, `date_of_test`, `age`, `comprehension`, `production`, `is_norming`), the dataset it's from (`dataset_name`, `dataset_origin_name`, `language`, `form`, `form_type`), and information about the child as described in the parameter specification.
english_ws_admins <- get_administration_data("English (American)", "WS") all_admins <- get_administration_data()
english_ws_admins <- get_administration_data("English (American)", "WS") all_admins <- get_administration_data()
Get item-by-age summary statistics for items across languages
get_crossling_data(uni_lemmas, db_args = NULL)
get_crossling_data(uni_lemmas, db_args = NULL)
uni_lemmas |
A character vector of uni_lemmas. |
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A data frame with a row for each combination of language, item, and age, and columns for summary statistics for the group: number of children (`n_children`), means (`comprehension`, `production`), standard deviations (`comprehension_sd`, `production_sd`); and item-level variables (`item_id`, `definition`, `uni_lemma`, `lexical_category`, `lexical_class`).
crossling_data <- get_crossling_data(uni_lemmas = "dog")
crossling_data <- get_crossling_data(uni_lemmas = "dog")
Get the uni_lemmas available in Wordbank
get_crossling_items(db_args = NULL)
get_crossling_items(db_args = NULL)
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A data frame with the column `uni_lemma`.
uni_lemmas <- get_crossling_items()
uni_lemmas <- get_crossling_items()
Get the Wordbank data sources
get_datasets(language = NULL, form = NULL, admin_data = FALSE, db_args = NULL)
get_datasets(language = NULL, form = NULL, admin_data = FALSE, db_args = NULL)
language |
An optional string specifying which language's datasets to retrieve. |
form |
An optional string specifying which form's datasets to retrieve. |
admin_data |
A logical indicating whether to include summary-level statistics on the administrations within a dataset. |
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A data frame where each row is a particular dataset and its characteristics: `dataset_id`, `dataset_name`, `dataset_origin_name` (unique identifier for groups of datasets that may share children), `language`, `form`, `form_type`, `contributor` (contributor name and affiliated institution), `citation`, `license`, `longitudinal` (whether the dataset includes longitudinal participants). Also includes summary statistics on a dataset if the `admin_data` flag is `TRUE`: number of administrations (`n_admins`).
english_ws_datasets <- get_datasets(language = "English (American)", form = "WS", admin_data = TRUE)
english_ws_datasets <- get_datasets(language = "English (American)", form = "WS", admin_data = TRUE)
Get the Wordbank administration-by-item data
get_instrument_data( language, form, items = NULL, administration_info = FALSE, item_info = FALSE, db_args = NULL, ... )
get_instrument_data( language, form, items = NULL, administration_info = FALSE, item_info = FALSE, db_args = NULL, ... )
language |
A string of the instrument's language (insensitive to case and whitespace). |
form |
A string of the instrument's form (insensitive to case and whitespace). |
items |
A character vector of column names of |
administration_info |
Either a logical indicating whether to include administration data or a data frame of administration data (as returned by `get_administration_data()`). |
item_info |
Either a logical indicating whether to include item data or a data frame of item data (as returned by `get_item_data()`). |
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
... |
<['dynamic-dots'][rlang::dyn-dots]> Arguments passed to
|
A data frame where each row contains the values (`value`, `produces`, `understands`) of a given item (`item_id`) for a given administration (`data_id`), with additional columns of variables about the administration and item, as specified.
eng_ws_data <- get_instrument_data(language = "English (American)", form = "WS", items = c("item_1", "item_42"), item_info = TRUE)
eng_ws_data <- get_instrument_data(language = "English (American)", form = "WS", items = c("item_1", "item_42"), item_info = TRUE)
Get the Wordbank instruments
get_instruments(db_args = NULL)
get_instruments(db_args = NULL)
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A data frame where each row is a CDI instrument and each column is a variable about the instrument (`instrument_id`, `language`, `form`, `age_min`, `age_max`, `has_grammar`).
instruments <- get_instruments()
instruments <- get_instruments()
Get the Wordbank by-item data
get_item_data(language = NULL, form = NULL, db_args = NULL)
get_item_data(language = NULL, form = NULL, db_args = NULL)
language |
An optional string specifying which language's items to retrieve. |
form |
An optional string specifying which form's items to retrieve. |
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A data frame where each row is a CDI item and each column is a variable about it: `item_id`, `item_kind` (e.g. word, gestures, word_endings), `item_definition`, `english_gloss`, `language`, `form`, `form_type`, `category` (meaning-based group as shown on the CDI form), `lexical_category`, `lexical_class`, `complexity_category`, `uni_lemma`.
english_ws_items <- get_item_data("English (American)", "WS") all_items <- get_item_data()
english_ws_items <- get_item_data("English (American)", "WS") all_items <- get_item_data()
Get database connection arguments
get_wordbank_args()
get_wordbank_args()
List of database connection arguments: host, db_name, username, password
get_wordbank_args()
get_wordbank_args()
Get item-by-age summary statistics
summarise_items(item_data, db_args = NULL)
summarise_items(item_data, db_args = NULL)
item_data |
A data frame as returned by `get_item_data()`. |
db_args |
List with arguments to connect to the Wordbank MySQL database (host, dbname, user, and password). |
A data frame with a row for each combination of item and age, and columns for summary statistics for the group: number of children (`n_children`), means (`comprehension`, `production`), standard deviations (`comprehension_sd`, `production_sd`); also retains item-level variables from `item_data` (`item_id`, `item_definition`, `uni_lemma`, `lexical_category`).
italian_items <- get_item_data(language = "Italian", form = "WG") if (!is.null(italian_items)) { italian_dog <- dplyr::filter(italian_items, uni_lemma == "dog") italian_dog_summary <- summarise_items(italian_dog) }
italian_items <- get_item_data(language = "Italian", form = "WG") if (!is.null(italian_items)) { italian_dog <- dplyr::filter(italian_items, uni_lemma == "dog") italian_dog_summary <- summarise_items(italian_dog) }