easystats · rempsyc · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.12
+Version: 0.13.0.13
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -267,6 +267,7 @@ export(data_write)
 export(degroup)
 export(demean)
 export(describe_distribution)
+export(describe_missing)
 export(detrend)
 export(distribution_coef_var)
 export(distribution_mode)

diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,10 @@ BREAKING CHANGES
 * Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na`
   instead.
 
+NEW FUNCTIONS
+
+* `describe_missing()`, to comprehensively report on missing values in a data frame.
+
 CHANGES
 
 * The `select` argument, which is available in different functions to select

diff --git a/R/describe_missing.R b/R/describe_missing.R
@@ -0,0 +1,115 @@
+#' @title Describe Missing Values in Data According to Guidelines
+#'
+#' @description Provides a detailed description of missing values in a data frame.
+#' This function reports both absolute and percentage missing values of specified
+#' column lists or scales, following recommended guidelines. Some authors recommend
+#' reporting item-level missingness per scale, as well as a participant's maximum
+#' number of missing items by scale. For example, Parent (2013) writes:
+#'
+#' *I recommend that authors (a) state their tolerance level for missing data by scale
+#' or subscale (e.g., "We calculated means for all subscales on which participants gave
+#' at least 75% complete data") and then (b) report the individual missingness rates
+#' by scale per data point (i.e., the number of missing values out of all data points
+#' on that scale for all participants) and the maximum by participant (e.g., "For Attachment
+#' Anxiety, a total of 4 missing data points out of 100 were observed, with no participant
+#' missing more than a single data point").*
+#'
+#' @param data The data frame to be analyzed.
+#' @param vars Variables (or a list of groups of variables) to check for missing values (NAs).
+#' @param scales Character vector of scale names; each selects the columns whose names start with it.
+#' @keywords missing values NA guidelines
+#' @return A data frame with the following columns:
+#'  - `var`: Variables selected.
+#'  - `items`: Number of items for selected variables.
+#'  - `na`: Number of missing cell values for those variables (e.g., 2 missing
+#'  values for the first participant + 2 missing values for the second participant
+#'  = total of 4 missing values).
+#'  - `cells`: Total number of cells (i.e., number of participants multiplied by
+#'  the number of variables, `items`).
+#'  - `na_percent`: The percentage of missing values (`na` divided by `cells`).
+#'  - `na_max`: The number of missing values for the participant with the most
+#'  missing values for the selected variables.
+#'  - `na_max_percent`: The amount of missing values for the participant with
+#'  the most missing values for the selected variables, as a percentage
+#'  (i.e., `na_max` divided by the number of selected variables, `items`).
+#'  - `all_na`: The number of participants missing 100% of items for that scale
+#'  (the selected variables).
+#'
+#' @export
+#' @references Parent, M. C. (2013). Handling item-level missing
+#' data: Simpler is just as good. *The Counseling Psychologist*,
+#' *41*(4), 568-600. https://doi.org/10.1177/0011000012445176
+#' @examples
+#' # Use the entire data frame
+#' describe_missing(airquality)
+#'
+#' # Use selected columns explicitly
+#' describe_missing(airquality,
+#'   vars = list(
+#'     c("Ozone", "Solar.R", "Wind"),
+#'     c("Temp", "Month", "Day")
+#'   )
+#' )
+#'
+#' # If the questionnaire items start with the same name, e.g.,
+#' set.seed(15)
+#' fun <- function() {
+#'   c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+#' }
+#' df <- data.frame(
+#'   ID = c("idz", NA),
+#'   open_1 = fun(), open_2 = fun(), open_3 = fun(),
+#'   extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(),
+#'   agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun()
+#' )
+#'
+#' # One can list the scale names directly:
+#' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable"))
+describe_missing <- function(data, vars = NULL, scales = NULL) {
+  # Test with is.null(), not missing(): an explicit `vars = NULL` (the default)
+  # must act like an omitted argument rather than select zero columns.
+  has_vars <- !is.null(vars)
+  has_scales <- !is.null(scales)
+  # Summary of the whole data frame: the only row when nothing is selected,
+  # otherwise relabelled "Total" and appended after the per-group rows.
+  na_df <- .describe_missing(data)
+  if (!has_vars && !has_scales) {
+    return(na_df)
+  }
+  if (has_vars) {
+    # `vars` wins over `scales`; a non-list `vars` is treated as one group.
+    groups <- if (is.list(vars)) vars else list(vars)
+  } else {
+    # Each scale selects the columns matching `^<scale>` (name starts with it).
+    groups <- lapply(scales, function(x) {
+      grep(paste0("^", x), names(data), value = TRUE)
+    })
+  }
+  na_list <- lapply(groups, function(x) {
+    .describe_missing(data[, x, drop = FALSE])
+  })
+  na_df$var <- "Total"
+  do.call(rbind, c(na_list, list(na_df)))
+}
+
+.describe_missing <- function(data) {
+  # One-row missingness summary of `data`, labelled "<first col>:<last col>".
+  items <- ncol(data)
+  cells <- nrow(data) * items
+  # Build the NA mask once; rowSums() avoids apply() coercing the data frame.
+  na_mask <- is.na(data)
+  na <- sum(na_mask)
+  na_per_row <- rowSums(na_mask)
+  # max() of an empty vector warns and yields -Inf, so zero rows report 0.
+  na_max <- if (length(na_per_row) > 0) max(na_per_row) else 0
+  data.frame(
+    var = paste0(names(data)[1], ":", names(data)[items]),
+    items = items,
+    na = na,
+    cells = cells,
+    na_percent = round(na / cells * 100, 2),
+    na_max = na_max,
+    na_max_percent = round(na_max / items * 100, 2),
+    all_na = sum(na_per_row == items) # rows in which every column is NA
+  )
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -8,14 +8,13 @@ CMD
 Carle
 Catran
 Crosstables
-Dhaliwal
-Disaggregating
 DOI
 De
-Dom
+Dhaliwal
+Disaggregating
 EFC
-Enders
 EUROFAMCARE
+Enders
 Fairbrother
 GLMM
 Gelman
@@ -54,7 +53,6 @@ Winsorizing
 al
 behaviour
 behaviours
-bmwiernik
 codebook
 codebooks
 coercible
@@ -77,7 +75,6 @@ joss
 labelled
 labelling
 leptokurtic
-lifecycle
 lm
 lme
 meaned
@@ -88,7 +85,6 @@ modelling
 nd
 panelr
 partialization
-patilindrajeets
 platykurtic
 poorman
 pre
@@ -102,7 +98,6 @@ recodes
 recoding
 recodings
 relevel
-rempsyc
 reproducibility
 rescale
 rescaled
@@ -111,7 +106,8 @@ rio
 rowid
 sd
 stackexchange
-strengejacke
+subscale
+subscales
 tailedness
 th
 tibble

diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd
diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml
@@ -66,6 +66,7 @@ reference:
       - data_tabulate
       - data_peek
       - data_seek
+      - describe_missing
       - means_by_group
       - contains("distribution")
       - kurtosis