% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tidying.R
\name{tidying}
\alias{tidying}
\alias{separate_}
\alias{unite_}
\alias{fill_}
\alias{drop_na_}
\alias{replace_na_}
\alias{uncount_}
\title{Tidying Functions}
\usage{
separate_(
  .data = (.),
  col,
  into,
  sep = "[^[:alnum:]]+",
  remove = TRUE,
  convert = FALSE,
  extra = "warn",
  fill = "warn",
  fixed = FALSE,
  ...
)

unite_(.data = (.), col, ..., sep = "_", remove = TRUE, na.rm = FALSE)

fill_(.data = (.), ..., .direction = "down")

drop_na_(.data = (.), ..., .na.attr = FALSE, .prop = 0)

replace_na_(.data = (.), replace, ..., v = NULL)

uncount_(.data = (.), weights, ..., .remove = TRUE, .id = NULL)
}
\arguments{
\item{.data}{A data frame (data.frame, data.table, or tibble)}

\item{col}{For \code{separate_()}: the column to separate. Can be a column name
as character, or a formula (e.g., \code{~col_name}). For \code{unite_()}: the name
of the new united column (character string or formula).}

\item{into}{For \code{separate_()}: names of new variables to create as a character
vector. Use \code{NA} to omit a variable in the output.}

\item{sep}{For \code{separate_()} and \code{unite_()}: separator between columns. For
\code{separate_()}, can be a character vector, a numeric vector of positions to
split at (not implemented yet), or a regular expression pattern. Default is
\code{"[^[:alnum:]]+"} for \code{separate_()} and \code{"_"} for \code{unite_()}.}

\item{remove}{Logical. If \code{TRUE} (default), remove input columns from output.}

\item{convert}{For \code{separate_()}: logical. If \code{TRUE}, attempts to convert
new columns to appropriate types. Default is \code{FALSE}.}

\item{extra}{For \code{separate_()} when \code{sep} is a character: what to do when
there are too many pieces. Options: \code{"warn"} (default, warn and drop),
\code{"drop"} (drop without warning), or \code{"merge"} (merge extra pieces with last).}

\item{fill}{For \code{separate_()} when \code{sep} is a character: what to do when
there are not enough pieces. Options: \code{"warn"} (default, warn and fill right
with NA), \code{"right"} (fill right without warning), or \code{"left"} (fill left).}

\item{fixed}{For \code{separate_()}: logical. If \code{TRUE}, \code{sep} is a fixed string.
If \code{FALSE} (default), \code{sep} is a (perl) regular expression.}

\item{...}{For \code{separate_()} and \code{unite_()}: additional arguments (currently
unused). For \code{fill_()} and \code{drop_na_()}: columns to fill or check for NAs.
Use formulas (e.g., \code{~col1}, \code{~col2}) or column names. If not provided,
uses all columns.}

\item{na.rm}{If \code{TRUE}, \code{NA}s are eliminated before uniting the values.}

\item{.direction}{Direction in which to fill missing data: \code{"down"} (the
default), \code{"up"}, \code{"downup"} (first down, then up), or \code{"updown"}
(first up, then down).}

\item{.na.attr}{Logical. \code{TRUE} adds an attribute containing the removed
cases. For compatibility reasons, this has exactly the same format as in
\code{\link[=na.omit]{na.omit()}}, i.e., the attribute is called "na.action" and is of class \strong{omit}.}

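\item{.prop}{Numeric. The proportion of missing values in a case (row) from
which the case is considered as missing and is dropped. With the default
\code{0}, a case is dropped as soon as it contains at least one missing value.}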

\item{replace}{If \code{.data} is a vector, a single value to replace \code{NA}s with;
otherwise, a list of values, one per column of the data frame.}

\item{v}{A vector in which to replace \code{NA}s.}

\item{weights}{A vector of weights to use to "uncount" \code{.data}, either a
numeric vector or a column of \code{.data} given as a formula (e.g., \code{~n}).}

\item{.remove}{If \code{TRUE} (default), and \code{weights} is the name of a column,
that column is removed from \code{.data}.}

\item{.id}{The name of the column for the origin id, either names if all
other arguments are named, or numbers.}
}
\value{
A data frame of the same type as \code{.data} with the transformation applied.
\itemize{
\item \code{separate_()} returns a data frame with the specified column split into
multiple columns
\item \code{unite_()} returns a data frame with specified columns combined into one
\item \code{fill_()} returns a data frame with missing values filled
\item \code{drop_na_()} returns a data frame with rows containing NAs removed
\item \code{replace_na_()} returns a data frame or vector with NAs replaced by specified values
\item \code{uncount_()} returns a data frame with rows duplicated according to weights
}
}
\description{
Functions for tidying data by separating, uniting, filling, and handling missing values.

These are SciViews::R versions of tidyverse functions with standard
evaluation and formula-based non-standard evaluation (ending with underscore
\verb{_}). They work with data.frame, data.table, and tibbles.

\strong{Functions:}
\itemize{
\item \code{separate_()} - Separate one column into multiple columns by splitting on a separator
\item \code{unite_()} - Unite multiple columns into one by pasting strings together
\item \code{fill_()} - Fill missing values using previous or next non-missing value
\item \code{drop_na_()} - Drop rows containing missing values
\item \code{replace_na_()} - Replace missing values with a specified value
\item \code{uncount_()} - Duplicate rows according to a weighting variable
}
}
\examples{
library(svTidy)

# separate_() - split one column into multiple
df <- data.frame(x = c("a_b_c", "d_e_f", "g_h_i"))
df |> separate_(~x, into = c("A", "B", "C"), sep = "_")

# Use character name instead of formula
df |> separate_("x", into = c("A", "B", "C"), sep = "_")

# Omit a variable from the output by using NA in `into`
df |> separate_(~x, into = c("A", NA, "C"), sep = "_")

# Keep original column
df |> separate_(~x, into = c("A", "B", "C"), sep = "_", remove = FALSE)

# Separating by numeric positions is not implemented yet
#df2 <- data.frame(date = c("20201231", "20210115", "20220601"))
#df2 |> separate_(~date, into = c("year", "month", "day"), sep = c(4, 6))

# Handle too many pieces
df3 <- data.frame(x = c("a_b_c", "d_e_f_g", "h_i"))
df3 |> separate_(~x, into = c("A", "B"), extra = "drop")
df3 |> separate_(~x, into = c("A", "B"), extra = "merge")

# Handle too few pieces
df3 |> separate_(~x, into = c("A", "B", "C"), fill = "right")

# unite_() - combine multiple columns into one
df4 <- data.frame(year = 2020:2022, month = 1:3, day = 10:12)
df4 |> unite_(~date, ~year, ~month, ~day, sep = "-")

# Keep original columns
df4 |> unite_(~date, ~year, ~month, ~day, sep = "-", remove = FALSE)

# Handle NAs in unite
df5 <- data.frame(x = c("a", "b", NA), y = c("d", NA, "f"))
df5 |> unite_(~z, ~x, ~y)
df5 |> unite_(~z, ~x, ~y, na.rm = TRUE)

# fill_() - fill missing values
df6 <- data.frame(
  group = c(1, 1, 1, 2, 2, 2),
  value = c(10, NA, NA, 20, NA, 30)
)
df6 |> fill_(~value)

# Fill upward
df6 |> fill_(~value, .direction = "up")

# Fill down then up
df6 |> fill_(~value, .direction = "downup")

# Fill specific columns
df7 <- data.frame(x = c(1, NA, 3), y = c(NA, 2, NA), z = c(1, 2, 3))
df7 |> fill_(~x, ~y, .direction = "down")

# Fill with grouped data
df6 |>
  group_by_(~group) |>
  fill_(~value)

# drop_na_() - remove rows with missing values
df8 <- data.frame(x = c(1, 2, NA), y = c("a", NA, "c"), z = 1:3)
df8 |> drop_na_()

# Drop NAs from specific columns only
df8 |> drop_na_(~x)
df8 |> drop_na_(~x, ~y)

# Use proportion threshold
df9 <- data.frame(x = c(1, NA, NA), y = c(NA, 2, NA), z = c(NA, NA, 3))
df9 |> drop_na_(.prop = 0.5)  # Drop rows with >= 50\% NAs

# Keep track of removed rows
result <- df8 |> drop_na_(.na.attr = TRUE)
attr(result, "na.action")

# replace_na_() - replace NAs with a value
df10 <- data.frame(x = c(1, 2, NA), y = c(NA, "b", "c"))
df10 |> replace_na_(list(x = 0, y = "missing"))

# Replace in a single vector
vec <- c(1, 2, NA, 4, NA)
replace_na_(v = vec, replace = 0)

# Replace all NAs with same value (not standard tidyr)
df10 |> replace_na_(list(everywhere = 999))

# uncount_() - duplicate rows according to weights
df11 <- data.frame(x = c("a", "b", "c"), n = c(1, 2, 3))
df11 |> uncount_(~n)

# Keep the weight column
df11 |> uncount_(~n, .remove = FALSE)

# Add ID column to track original rows
df11 |> uncount_(~n, .id = "id")

# Use numeric weights vector
df12 <- data.frame(x = c("a", "b", "c"))
df12 |> uncount_(weights = c(2, 1, 3))

}
\seealso{
\code{\link[tidyr:separate]{tidyr::separate()}}, \code{\link[tidyr:unite]{tidyr::unite()}}, \code{\link[tidyr:fill]{tidyr::fill()}}, \code{\link[tidyr:drop_na]{tidyr::drop_na()}},
\code{\link[tidyr:replace_na]{tidyr::replace_na()}}, \code{\link[tidyr:uncount]{tidyr::uncount()}}, \code{\link[collapse:efficient-programming]{collapse::na_omit()}},
\code{\link[collapse:recode-replace]{collapse::replace_na()}}
}
