diff --git a/NAMESPACE b/NAMESPACE index d91f16c9..cf94e3bd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,6 @@ export(GCA2Lineage) export(IPG2Lineage) export(acc2FA) export(acc2Lineage) -export(acc2fa) export(addLeaves2Alignment) export(addLineage) export(addName) diff --git a/R/CHANGED-pre-msa-tree.R b/R/CHANGED-pre-msa-tree.R index 2f6c8a62..48d1abf9 100644 --- a/R/CHANGED-pre-msa-tree.R +++ b/R/CHANGED-pre-msa-tree.R @@ -40,10 +40,14 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @param y Delimitter. Default is space (" "). #' @seealso chartr, toupper, and tolower. #' -#' @return +#' @return Character vector with the input strings converted to title case. +#' #' @export #' #' @examples +#' # Convert a single string to title case +#' convert2TitleCase("hello world") # Returns "Hello World" +#' convert2TitleCase <- function(x, y = " ") { s <- strsplit(x, y)[[1]] paste(toupper(substring(s, 1, 1)), substring(s, 2), @@ -76,7 +80,8 @@ convert2TitleCase <- function(x, y = " ") { #' @importFrom stringr str_sub #' @importFrom tidyr replace_na separate #' -#' @return +#' @return A data frame containing the enriched alignment data with lineage +#' information. #' #' @details The alignment file would need two columns: 1. accession + #' number and 2. alignment. The protein homolog accession to lineage mapping + @@ -203,6 +208,14 @@ addLeaves2Alignment <- function(aln_file = "", #' @export #' #' @examples +#' # Example usage of the addName function +#' data <- data.frame( +#' AccNum = c("ACC123", "ACC456"), +#' Species = c("Homo sapiens", "Mus musculus"), +#' Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata") +#' ) +#' enriched_data <- addName(data) +#' enriched_data addName <- function(data, accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage", lin_sep = ">", out_col = "Name") { @@ -278,7 +291,9 @@ addName <- function(data, #' @note Please refer to the source code if you have alternate + #' file formats and/or column names. 
#' -#' @return +#' @return A character string representing the FASTA formatted sequences. +#' If `fa_outpath` is provided, the FASTA will also be saved to the specified +#' file. #' @export #' #' @examples @@ -321,23 +336,29 @@ convertAlignment2FA <- function(aln_file = "", } #' mapAcc2Name -#' +#' #' @description #' Default renameFA() replacement function. Maps an accession number to its name #' #' @param line The line of a fasta file starting with '>' -#' @param acc2name Data Table containing a column of accession numbers and a name column +#' @param acc2name Data Table containing a column of accession numbers and a +#' name column #' @param acc_col Name of the column containing Accession numbers -#' @param name_col Name of the column containing the names that the accession numbers +#' @param name_col Name of the column containing the names that the accession +#' numbers #' are mapped to #' #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A character string representing the updated FASTA line, where the +#' accession number is replaced with its corresponding name. #' @export #' #' @examples +#' \dontrun{ +#' mapAcc2Name(">P12345 some description", acc2name, "AccNum", "Name") +#' } mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { # change to be the name equivalent to an addNames column # Find the first ' ' @@ -363,10 +384,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { #' @importFrom purrr map #' @importFrom readr read_lines write_lines #' -#' @return +#' @return A character vector of the modified lines in the FASTA file. #' @export #' #' @examples +#' \dontrun{ +#' renameFA("path/to/input.fasta", +#' "path/to/output.fasta", mapAcc2Name, acc2name) +#' } renameFA <- function(fa_path, outpath, replacement_function = mapAcc2Name, ...) 
{ lines <- read_lines(fa_path) @@ -386,8 +411,8 @@ renameFA <- function(fa_path, outpath, ################################ ## generateAllAlignments2FA #' generateAllAlignments2FA -#' -#' @description +#' +#' @description #' Adding Leaves to an alignment file w/ accessions #' #' @keywords alignment, accnum, leaves, lineage, species @@ -395,20 +420,26 @@ renameFA <- function(fa_path, outpath, #' #' @param aln_path Character. Path to alignment files. #' Default is 'here("data/rawdata_aln/")' -#' @param fa_outpath Character. Path to file. Master protein file with AccNum & lineages. +#' @param fa_outpath Character. Path to file. Master protein file with AccNum & +#' lineages. #' Default is 'here("data/rawdata_tsv/all_semiclean.txt")' #' @param lin_file Character. Path to the written fasta file. #' Default is 'here("data/alns/")'. -#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage. +#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' sequence per lineage. #' Default is 'FALSE'. #' #' @importFrom purrr pmap #' @importFrom stringr str_replace_all #' -#' @return +#' @return NULL. The function saves the output FASTA files to the specified +#' directory. #' -#' @details The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. -#' @note Please refer to the source code if you have alternate + file formats and/or column names. +#' @details The alignment files would need two columns separated by spaces: +#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum, +#' Species, Lineages. +#' @note Please refer to the source code if you have alternate + file formats +#' and/or column names. 
#' #' @export #' @@ -447,33 +478,38 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), # accessions <- c("P12345","Q9UHC1","O15530","Q14624","P0DTD1") # accessions <- rep("ANY95992.1", 201) -#' acc2FA +#' acc2FA #' #' @description -#' converts protein accession numbers to a fasta format. Resulting +#' converts protein accession numbers to a fasta format. Resulting #' fasta file is written to the outpath. #' #' @author Samuel Chen, Janani Ravi #' @keywords accnum, fasta #' -#' @param accessions Character vector containing protein accession numbers to generate fasta sequences for. +#' @param accessions Character vector containing protein accession numbers to +#' generate fasta sequences for. #' Function may not work for vectors of length > 10,000 #' @param outpath [str] Location where fasta file should be written to. -#' @param plan +#' @param plan Character string specifying the parallel processing strategy to +#' use with the `future` package. Default is "sequential". #' #' @importFrom Biostrings readAAStringSet #' @importFrom future future plan value #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return +#' @return A logical value indicating whether the retrieval and conversion were +#' successful. Returns `TRUE` if successful and `FALSE` otherwise. 
#' @export #' #' @examples #' \dontrun{ -#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") +#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +#' outpath = "my_proteins.fasta") #' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") -#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") +#' EBI:accessions <- c("P12345", "Q9UHC1", +#' "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") #' } acc2FA <- function(accessions, outpath, plan = "sequential") { # validation @@ -547,9 +583,10 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { } #' createRepresentativeAccNum -#' +#' #' @description -#' Function to generate a vector of one Accession number per distinct observation from 'reduced' column +#' Function to generate a vector of one Accession number per distinct +#' observation from 'reduced' column #' #' @author Samuel Chen, Janani Ravi #' @@ -562,14 +599,19 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A character vector containing one Accession number per distinct +#' observation from the specified reduced column. #' @export #' #' @examples +#' \dontrun{ +#' createRepresentativeAccNum(prot) +#' } createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column + # Get Unique reduced column and then bind the AccNums back to get one + # AccNum per reduced column reduced_sym <- sym(reduced) accnum_sym <- sym(accnum_col) @@ -596,15 +638,17 @@ createRepresentativeAccNum <- function(prot_data, } #' alignFasta -#' +#' #' @description #' Perform a Multiple Sequence Alignment on a FASTA file. 
#' #' @author Samuel Chen, Janani Ravi #' #' @param fasta_file Path to the FASTA file to be aligned -#' @param tool Type of alignment tool to use. One of three options: "Muscle", "ClustalO", or "ClustalW" -#' @param outpath Path to write the resulting alignment to as a FASTA file. If NULL, no file is written +#' @param tool Type of alignment tool to use. One of three options: "Muscle", +#' "ClustalO", or "ClustalW" +#' @param outpath Path to write the resulting alignment to as a FASTA file. +#' If NULL, no file is written #' #' @importFrom Biostrings readAAStringSet #' @importFrom msa msaClustalOmega msaMuscle msaClustalW @@ -613,6 +657,10 @@ createRepresentativeAccNum <- function(prot_data, #' @export #' #' @examples +#' \dontrun{ +#' aligned_sequences <- alignFasta("my_sequences.fasta", +#' tool = "Muscle", outpath = "aligned_output.fasta") +#' } alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { fasta <- readAAStringSet(fasta_file) @@ -641,11 +689,14 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { #' @importFrom Biostrings toString unmasked #' @importFrom readr write_file #' -#' @return +#' @return Character string representing the content of the written FASTA file. #' @export #' #' @examples -writeMSA_AA2FA <- function(alignment, outpath) { +#' \dontrun{ +#' writeMSA_AA2FA(alignment, outpath = "aligned_output.fasta") +#' } +writeMSA_AA2FA <- function(alignment, outpath) { l <- length(rownames(alignment)) fasta <- "" for (i in 1:l) @@ -660,14 +711,18 @@ writeMSA_AA2FA <- function(alignment, outpath) { #' getAccNumFromFA #' -#' @param fasta_file +#' @param fasta_file Character. The path to the FASTA file from which +#' accession numbers will be extracted. #' #' @importFrom stringi stri_extract_all_regex #' -#' @return +#' @return A character vector containing the extracted accession numbers. 
#' @export #' #' @examples +#' \dontrun{ +#' getAccNumFromFA("my_sequences.fasta") +#' } getAccNumFromFA <- function(fasta_file) { txt <- read_file(fasta_file) accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]] diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 48753afa..3c9c4192 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -3,21 +3,29 @@ #' Run DELTABLAST to find homologs for proteins of interest #' #' @author Samuel Chen, Janani Ravi +#' @description +#' This function executes a Delta-BLAST search using the specified parameters +#' and database. It sets the BLAST database path, runs the Delta-BLAST command +#' with the given query, and outputs the results. #' -#' @param deltablast_path -#' @param db_search_path Path to the BLAST databases -#' @param db -#' @param query -#' @param evalue -#' @param out -#' @param num_alignments -#' @param num_threads +#' @param deltablast_path Path to the Delta-BLAST executable. +#' @param db_search_path Path to the BLAST databases. +#' @param db Name of the BLAST database to search against (default is "refseq"). +#' @param query Path to the input query file. +#' @param evalue E-value threshold for reporting matches (default is "1e-5"). +#' @param out Path to the output file where results will be saved. +#' @param num_alignments Number of alignments to report. +#' @param num_threads Number of threads to use for the search (default is 1). #' -#' @return +#' @return This function does not return a value; it outputs results to the +#' specified file. 
#' @export #' #' @examples -runDeltaBlast <- function(deltablast_path, db_search_path, +#' \dontrun{ +#' runDeltaBlast(deltablast_path, db_search_path) +#' } +runDeltaBlast <- function(deltablast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_alignments, num_threads = 1) { start <- Sys.time() @@ -43,18 +51,27 @@ runDeltaBlast <- function(deltablast_path, db_search_path, #' Run RPSBLAST to generate domain architectures for proteins of interest #' -#' @param rpsblast_path -#' @param db_search_path Path to the BLAST databases -#' @param db -#' @param query -#' @param evalue -#' @param out -#' @param num_threads +#' @description +#' This function executes an RPS-BLAST search to generate domain architectures +#' for specified proteins. It sets the BLAST database path, runs the RPS-BLAST +#' command with the provided query, and outputs the results. #' -#' @return +#' @param rpsblast_path Path to the RPS-BLAST executable. +#' @param db_search_path Path to the BLAST databases. +#' @param db Name of the BLAST database to search against (default is "refseq"). +#' @param query Path to the input query file. +#' @param evalue E-value threshold for reporting matches (default is "1e-5"). +#' @param out Path to the output file where results will be saved. +#' @param num_threads Number of threads to use for the search (default is 1). +#' +#' @return This function does not return a value; it outputs results to the +#' specified file. 
#' @export #' #' @examples +#' \dontrun{ +#' runRPSBlast(rpsblast_path, db_search_path, query = query, out = out) +#' } runRPSBlast <- function(rpsblast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_threads = 1) { diff --git a/R/cleanup.R b/R/cleanup.R index 4fe074ee..a8e79e33 100755 --- a/R/cleanup.R +++ b/R/cleanup.R @@ -46,7 +46,8 @@ cleanString <- function(string) { # get_sequences() function to extract accession numbers #' extractAccNum #' -#' @param string +#' @param string A string from which to extract the accession number. +#' The string may contain accession information delimited by `|` or spaces. #' #' @return Describe return, in detail #' @export @@ -103,7 +104,9 @@ ensureUniqAccNum <- function(accnums) { #' Parse accesion numbers from fasta and add a #' suffix of the ith occurence to handle duplicates #' -#' @param fasta +#' @param fasta An [XStringSet] object representing the sequences from a +#' FASTA file. The sequence names (headers) will be adjusted for uniqueness +#' and sanitized. #' #' @importFrom purrr map_chr #' @importFrom fs path_sanitize @@ -148,7 +151,8 @@ cleanFAHeaders <- function(fasta) { #' #' @importFrom dplyr as_tibble filter #' -#' @return Describe return, in detail +#' @return A tibble with rows removed where the specified column contains +#' `"-"`, `"NA"`, or an empty string. #' @export #' #' @examples @@ -183,7 +187,7 @@ removeEmptyRows <- function(prot, by_column = "DomArch") { #' @param by_column Column in which repeats are condensed to domain+domain -> domain(s). #' @param excluded_prots Vector of strings that condenseRepeatedDomains should not reduce to (s). Defaults to c() #' -#' @return Describe return, in detail +#' @return A data frame with condensed repeated domains in the specified column. 
#' @export #' #' @importFrom dplyr pull @@ -244,7 +248,9 @@ condenseRepeatedDomains <- function(prot, by_column = "DomArch", excluded_prots #' @param prot DataTable to operate on #' @param by_column Column to operate on #' -#' @return Describe return, in detail +#' @return The original data frame with the specified column updated. All +#' consecutive '?' characters will be replaced with 'X(s)', and individual '?' +#' characters will be replaced with 'X'. #' @export #' #' @importFrom dplyr pull @@ -273,19 +279,21 @@ replaceQuestionMarks <- function(prot, by_column = "GenContext") { } -#' Remove Astrk +#' Remove Asterisk #' #' @description #' Remove the asterisks from a column of data #' Used for removing * from GenContext columns #' -#' @param query_data -#' @param colname +#' @param query_data A data frame containing the data to be processed. +#' @param colname The name of the column from which asterisks should be removed. +#' Defaults to "GenContext". #' #' @importFrom purrr map #' @importFrom stringr str_remove_all #' -#' @return Describe return, in detail +#' @return The original data frame with asterisks removed from the specified +#' column. #' @export #' #' @examples @@ -315,7 +323,8 @@ removeAsterisks <- function(query_data, colname = "GenContext") { #' @param by_column Default column is 'DomArch'. Can also take 'ClustName', 'GenContext' as input. #' @param keep_domains Default is False Keeps tail entries that contain the query domains. #' -#' @return Describe return, in detail +#' @return The original data frame with singletons removed from the specified +#' column. #' @export #' #' @importFrom dplyr count filter group_by pull n summarize @@ -374,7 +383,7 @@ removeTails <- function(prot, by_column = "DomArch", #' #' @importFrom stringr coll str_replace_all #' -#' @return Describe return, in detail +#' @return The original data frame with Species cleaned. 
#' @export #' #' @examples @@ -504,25 +513,34 @@ cleanClusters <- function(prot, #' The original data frame is returned with the clean DomArchs column and the old domains in the DomArchs.old column. #' #' @param prot A data frame containing a 'DomArch' column -#' @param old -#' @param new +#' @param old The name of the original column containing domain architecture. +#' Defaults to "DomArch.orig". +#' @param new The name of the cleaned column to be created. Defaults to +#' "DomArch". #' @param domains_keep A data frame containing the domain names to be retained. -#' @param domains_rename A data frame containing the domain names to be replaced in a column 'old' and the +#' @param domains_rename A data frame containing the domain names to be replaced +#' in a column 'old' and the #' corresponding replacement values in a column 'new'. -#' @param condenseRepeatedDomains Boolean. If TRUE, repeated domains in 'DomArch' are condensed. Default is TRUE. -#' @param removeTails Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE. -#' @param removeEmptyRows Boolean. If TRUE, rows with empty/unnecessary values in 'DomArch' are removed. Default is FALSE. -#' @param domains_ignore A data frame containing the domain names to be removed in a column called 'domains' +#' @param condenseRepeatedDomains Boolean. If TRUE, repeated domains in +#' 'DomArch' are condensed. Default is TRUE. +#' @param removeTails Boolean. If TRUE, 'ClustName' will be filtered based on +#' domains to keep/remove. Default is FALSE. +#' @param removeEmptyRows Boolean. If TRUE, rows with empty/unnecessary values +#' in 'DomArch' are removed. Default is FALSE. 
+#' @param domains_ignore A data frame containing the domain names to be removed +#' in a column called 'domains' #' #' @importFrom dplyr pull #' @importFrom stringr coll str_replace_all #' -#' @return The original data frame is returned with the clean DomArchs column and the old domains in the DomArchs.old column. +#' @return The original data frame is returned with the clean DomArchs column +#' and the old domains in the DomArchs.old column. #' @export #' #' @examples #' \dontrun{ -#' cleanDomainArchitecture(prot, TRUE, FALSE, domains_keep, domains_rename, domains_ignore = NULL) +#' cleanDomainArchitecture(prot, TRUE, FALSE, +#' domains_keep, domains_rename, domains_ignore = NULL) #' } cleanDomainArchitecture <- function(prot, old = "DomArch.orig", new = "DomArch", domains_keep, domains_rename, @@ -658,8 +676,9 @@ cleanGenomicContext <- function(prot, domains_rename = data.frame("old" = charac #' Cleanup GeneDesc #' -#' @param prot -#' @param column +#' @param prot A data frame containing the gene descriptions. +#' @param column The name of the column from which gene descriptions are pulled +#' for cleanup. #' #' @return Return trailing period that occurs in GeneDesc column #' @export @@ -677,13 +696,16 @@ cleanGeneDescription <- function(prot, column) { #' Pick Longer Duplicate #' -#' @param prot -#' @param column +#' @param prot A data frame containing the data, with at least one column +#' named 'AccNum' for identification of duplicates. +#' @param column The name of the column from which the longest entry among +#' duplicates will be selected. #' #' @importFrom dplyr arrange filter group_by pull n select summarize #' @importFrom rlang sym #' -#' @return Describe return, in detail +#' @return A data frame containing only the longest entries among duplicates +#' based on the specified column. 
#' @export #' #' @examples @@ -728,10 +750,13 @@ selectLongestDuplicate <- function(prot, column) { #' Cleanup Lineage #' -#' @param prot -#' @param lins_rename +#' @param prot A data frame containing a 'Lineage' column that needs to be +#' cleaned up. +#' @param lins_rename A data frame with two columns: 'old' containing terms +#' to be replaced and 'new' containing the corresponding replacement terms. #' -#' @return Describe return, in detail +#' @return The original data frame with the 'Lineage' column updated based on +#' the provided replacements. #' @export #' #' @examples diff --git a/R/combine_analysis.R b/R/combine_analysis.R index 55e36925..efda14a8 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -8,15 +8,23 @@ #' Combining full_analysis files #' -#' @param inpath -#' @param ret +#' @param inpath Character. The path to the directory containing the +#' `.full_analysis.tsv` files to be combined. +#' @param ret Logical. If TRUE, the function will return the combined data frame. +#' Default is FALSE, meaning it will only write the file and not return the data. #' #' @importFrom readr write_tsv #' -#' @return +#' @return If `ret` is TRUE, a data frame containing the combined data from all +#' input files. If `ret` is FALSE, the function writes the combined data to a +#' TSV file named `cln_combined.tsv` in the specified directory and returns NULL. +#' #' @export #' #' @examples +#' \dontrun{ +#' combined_data <- combineFullAnalysis("path/to/full_analysis/files", ret = TRUE) +#' } combineFullAnalysis <- function(inpath, ret = FALSE) { ## Combining full_analysis files full_combnd <- combineFiles(inpath, @@ -35,15 +43,23 @@ combineFullAnalysis <- function(inpath, ret = FALSE) { #' Combining clean ipr files #' -#' @param inpath -#' @param ret +#' @param inpath Character. The path to the directory containing the +#' `.iprscan_cln.tsv` files to be combined. +#' @param ret Logical. If TRUE, the function will return the combined data frame. 
+#' Default is FALSE, meaning it will only write the file and not return the data. #' #' @importFrom readr write_tsv #' -#' @return +#' @return If `ret` is TRUE, a data frame containing the combined data from all +#' input files. If `ret` is FALSE, the function writes the combined data to a +#' TSV file named `ipr_combined.tsv` in the specified directory and returns NULL. +#' #' @export #' #' @examples +#' \dontrun{ +#' ipr_combined <- combineIPR("path/to/ipr/files", ret = TRUE) +#' } combineIPR <- function(inpath, ret = FALSE) { ## Combining clean ipr files ipr_combnd <- combineFiles(inpath, diff --git a/R/combine_files.R b/R/combine_files.R index 455ddd53..4f03b1d2 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -24,20 +24,30 @@ #' #' @author Janani Ravi #' -#' @param inpath String of 'master' path where the files reside (recursive=T) -#' @param pattern Character vector containing search pattern for files -#' @param delim -#' @param skip -#' @param col_names Takes logical T/F arguments OR column names vector; -#' usage similar to col_names parameter in `readr::read_delim` +#' @param inpath Character. The master directory path where the files reside. +#' The search is recursive (i.e., it will look in subdirectories as well). +#' @param pattern Character. A search pattern to identify files to be combined. +#' Default is "*full_analysis.tsv". +#' @param delim Character. The delimiter used in the input files. +#' Default is tab ("\t"). +#' @param skip Integer. The number of lines to skip at the beginning of each file. +#' Default is 0. +#' @param col_names Logical or character vector. If TRUE, the first row of each file +#' is treated as column names. Alternatively, a character vector can +#' be provided to specify custom column names. #' #' @importFrom purrr pmap_dfr #' @importFrom readr cols #' -#' @return +#' @return A data frame containing the combined contents of all matched files. 
+#' Each row will include a new column "ByFile" indicating the source file of the data. +#' #' @export #' #' @examples +#' \dontrun{ +#' combined_data <- combineFiles(inpath = "../molevol_data/project_data/phage_defense/") +#' } combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\t", skip = 0, diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index 2408c5e6..b33bd4b4 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -10,12 +10,12 @@ #' #' @author Samuel Chen #' -#' @param lineage_file Path to the rankedlineage.dmp file containing taxid's and their -#' corresponding taxonomic rank. rankedlineage.dmp can be downloaded at +#' @param lineage_file Path to the rankedlineage.dmp file containing taxid's +#' and their corresponding taxonomic rank. rankedlineage.dmp can be downloaded at #' https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/ #' @param outfile File the resulting lineage lookup table should be written to -#' @param taxonomic_rank The upperbound of taxonomic rank that the lineage includes. The lineaege will -#' include superkingdom>...>taxonomic_rank. +#' @param taxonomic_rank The upper bound of taxonomic rank that the lineage +#' includes. The lineage will include superkingdom>...>taxonomic_rank. #' Choices include: "supperkingdom", "phylum", "class","order", "family", #' "genus", and "species" #' @@ -25,10 +25,17 @@ #' @importFrom stringr str_locate str_replace_all #' @importFrom tidyr unite #' -#' @return +#' @return A tibble containing the tax IDs and their respective lineages up to +#' the specified taxonomic rank, saved as a tab-separated file. 
+#' #' @export #' #' @examples +#' \dontrun{ +#' createLineageLookup(lineage_file = "data/rankedlineage.dmp", +#' outfile = "data/lineage_lookup.tsv", +#' taxonomic_rank = "family") +#' } createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { .shortenNA <- function(Lineage) { diff --git a/R/fa2domain.R b/R/fa2domain.R index 6dc6f622..f53322ca 100644 --- a/R/fa2domain.R +++ b/R/fa2domain.R @@ -6,15 +6,28 @@ #' runIPRScan #' -#' @param filepath_fasta -#' @param filepath_out -#' @param appl +#' Run InterProScan on a given FASTA file and save the results to an +#' output file. +#' +#' @param filepath_fasta A string representing the path to the input FASTA file. +#' @param filepath_out A string representing the base path for the output file. +#' @param appl A character vector specifying the InterProScan applications to +#' use (e.g., "Pfam", "Gene3D"). Default is `c("Pfam", "Gene3D")`. #' #' @importFrom stringr str_glue #' -#' @return +#' @return A data frame containing the results from the InterProScan output +#' TSV file. #' #' @examples +#' \dontrun{ +#' results <- runIPRScan( +#' filepath_fasta = "path/to/your_fasta_file.fasta", +#' filepath_out = "path/to/output_file", +#' appl = c("Pfam", "Gene3D") +#' ) +#' results +#' } runIPRScan <- function( filepath_fasta, filepath_out, # do not inlucde file extension since ipr handles this diff --git a/R/ipr2viz.R b/R/ipr2viz.R index 9b625d4e..e582ab09 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -19,10 +19,17 @@ #' #' @importFrom ggplot2 element_blank element_line theme theme_grey #' -#' @return +#' @return A ggplot2 theme object. 
#' @export -#' #' @examples +#' library(ggplot2) +#' +#' # Create a sample plot using the custom theme +#' ggplot(mtcars, aes(x = wt, y = mpg)) + +#' geom_point() + +#' themeGenes2() + +#' labs(title = "Car Weight vs MPG") +#' themeGenes2 <- function() { ggplot2::theme_grey() + ggplot2::theme( panel.background = ggplot2::element_blank(), @@ -44,11 +51,16 @@ themeGenes2 <- function() { #' getTopAccByLinDomArch #' @description Group by lineage + DA then take top 20 #' -#' @param infile_full -#' @param DA_col -#' @param lin_col -#' @param n -#' @param query +#' @param infile_full A data frame containing the full dataset with lineage and +#' domain architecture information. +#' @param DA_col A string representing the name of the domain architecture +#' column. Default is "DomArch.Pfam". +#' @param lin_col A string representing the name of the lineage column. +#' Default is "Lineage_short". +#' @param n An integer specifying the number of top accession numbers to return. +#' Default is 20. +#' @param query A string for filtering a specific query name. If it is not +#' "All", only the data matching this query will be processed. #' #' @importFrom dplyr arrange filter group_by select summarise #' @importFrom shiny showNotification @@ -56,10 +68,16 @@ themeGenes2 <- function() { #' @importFrom rlang sym #' @importFrom rlang .data #' -#' @return +#' @return A vector of the top N accession numbers (`AccNum`) based on counts +#' grouped by lineage and domain architecture. 
#' @export #' #' @examples +#' \dontrun{ +#' top_accessions <- getTopAccByLinDomArch(infile_full = my_data, +#' DA_col = "DomArch.Pfam", lin_col = "Lineage_short", +#' n = 20, query = "specific_query_name") +#' } getTopAccByLinDomArch <- function(infile_full, DA_col = "DomArch.Pfam", lin_col = "Lineage_short", @@ -95,15 +113,25 @@ getTopAccByLinDomArch <- function(infile_full, ############################################# #' plotIPR2Viz #' -#' @param infile_ipr -#' @param infile_full -#' @param accessions -#' @param analysis -#' @param group_by -#' @param topn -#' @param name -#' @param text_size -#' @param query +#' @param infile_ipr A path to the input IPR file (TSV format) containing +#' domain information. +#' @param infile_full A path to the full input file (TSV format) containing +#' lineage and accession information. +#' @param accessions A character vector of accession numbers to filter the +#' analysis. Default is an empty vector. +#' @param analysis A character vector specifying the types of analysis to +#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a +#' vector of these analyses. +#' @param group_by A string specifying how to group the visualization. +#' Default is "Analysis". Options include "Analysis" or "Query". +#' @param topn An integer specifying the number of top accessions to visualize. +#' Default is 20. +#' @param name A string representing the name to use for y-axis labels. +#' Default is "Name". +#' @param text_size An integer specifying the text size for the plot. +#' Default is 15. +#' @param query A string for filtering a specific query name. If it is not +#' "All", only the data matching this query will be processed. 
#' #' @importFrom dplyr distinct filter select #' @importFrom gggenes geom_gene_arrow geom_subgene_arrow @@ -112,10 +140,22 @@ getTopAccByLinDomArch <- function(infile_full, #' @importFrom tidyr pivot_wider #' @importFrom stats as.formula #' -#' @return +#' @return A ggplot object representing the domain architecture visualization. #' @export #' #' @examples +#' \dontrun{ +#' plot <- plotIPR2Viz(infile_ipr = "path/to/ipr_file.tsv", +#' infile_full = "path/to/full_file.tsv", +#' accessions = c("ACC123", "ACC456"), +#' analysis = c("Pfam", "TMHMM"), +#' group_by = "Analysis", +#' topn = 20, +#' name = "Gene Name", +#' text_size = 15, +#' query = "All") +#' plot +#' } plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"), group_by = "Analysis", # "Analysis" @@ -251,15 +291,25 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), #' plotIPR2VizWeb #' -#' @param infile_ipr -#' @param accessions -#' @param analysis -#' @param group_by -#' @param name -#' @param text_size -#' @param legend_name -#' @param cols -#' @param rows +#' @param infile_ipr A path to the input IPR file (TSV format) containing +#' domain information. +#' @param accessions A character vector of accession numbers to filter the +#' analysis. +#' @param analysis A character vector specifying the types of analysis to +#' include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a vector +#' of these analyses. +#' @param group_by A string specifying how to group the visualization. +#' Default is "Analysis". Options include "Analysis" or "Query". +#' @param name A string representing the name to use for y-axis labels. +#' Default is "Name". +#' @param text_size An integer specifying the text size for the plot. +#' Default is 15. +#' @param legend_name A string representing the column to use for legend labels. +#' Default is "ShortName". 
+#' @param cols An integer specifying the number of columns in the facet wrap. +#' Default is 5. +#' @param rows An integer specifying the number of rows in the legend. +#' Default is 10. #' #' @importFrom dplyr arrange distinct filter select #' @importFrom gggenes geom_gene_arrow geom_subgene_arrow @@ -267,10 +317,23 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), #' @importFrom readr read_tsv #' @importFrom tidyr pivot_wider #' -#' @return +#' @return A ggplot object representing the domain architecture visualization +#' for web display. #' @export #' #' @examples +#' \dontrun{ +#' plot <- plotIPR2VizWeb(infile_ipr = "path/to/ipr_file.tsv", +#' accessions = c("ACC123", "ACC456"), +#' analysis = c("Pfam", "TMHMM"), +#' group_by = "Analysis", +#' name = "Gene Name", +#' text_size = 15, +#' legend_name = "ShortName", +#' cols = 5, +#' rows = 10) +#' plot +#' } plotIPR2VizWeb <- function(infile_ipr, accessions, analysis = c("Pfam", "Phobius", "TMHMM", "Gene3D"), diff --git a/R/lineage.R b/R/lineage.R index ef4fe586..46249c91 100644 --- a/R/lineage.R +++ b/R/lineage.R @@ -11,17 +11,24 @@ #' #' @author Samuel Chen, Janani Ravi #' -#' @param outpath String of path where the assembly summary file should be written -#' @param keep Character vector containing which columns should be retained and downloaded +#' @param outpath String of path where the assembly summary file should be +#' written +#' @param keep Character vector containing which columns should be retained and +#' downloaded #' #' @importFrom data.table fwrite setnames #' @importFrom dplyr bind_rows select #' @importFrom biomartr getKingdomAssemblySummary #' -#' @return +#' @return A tab-separated file containing the assembly summary. The function +#' does not return any value but writes the output directly to the specified file.
#' @export #' #' @examples +#' \dontrun{ +#' downloadAssemblySummary(outpath = "assembly_summary.tsv", +#' keep = c("assembly_accession", "taxid", "organism_name")) +#' } downloadAssemblySummary <- function(outpath, keep = c( "assembly_accession", "taxid", @@ -78,15 +85,24 @@ downloadAssemblySummary <- function(outpath, #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the #' "createLineageLookup()" function -#' @param acc_col +#' @param acc_col Character. The name of the column in `prot_data` containing +#' accession numbers. Default is "AccNum". #' #' @importFrom dplyr pull #' @importFrom data.table fread setnames #' -#' @return +#' @return A dataframe containing the merged information of GCA_IDs, TaxIDs, +#' and their corresponding lineage up to the phylum level. The dataframe +#' will include information from the input `prot_data` and lineage data. +#' #' @export #' #' @examples +#' \dontrun{ +#' result <- GCA2Lineage(prot_data = my_prot_data, +#' assembly_path = "path/to/assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv") +#' } GCA2Lineage <- function(prot_data, assembly_path = "/data/research/jravilab/common_data/assembly_summary_genbank.txt", lineagelookup_path = "/data/research/jravilab/common_data/lineage_lookup.tsv", @@ -135,20 +151,34 @@ GCA2Lineage <- function(prot_data, ################################### #' addLineage #' -#' @param df -#' @param acc_col -#' @param assembly_path -#' @param lineagelookup_path -#' @param ipgout_path -#' @param plan +#' @param df Dataframe containing accession numbers. The dataframe should +#' have a column specified by `acc_col` that contains these accession numbers. +#' @param acc_col Character. The name of the column in `df` containing +#' accession numbers. Default is "AccNum". +#' @param assembly_path String. 
The path to the assembly summary file generated +#' using the `downloadAssemblySummary()` function. +#' @param lineagelookup_path String. The path to the lineage lookup file (taxid +#' to lineage mapping) generated using the `createLineageLookup()` function. +#' @param ipgout_path String. Optional path to save intermediate output files. +#' Default is NULL. +#' @param plan Character. Specifies the execution plan for parallel processing. +#' Default is "multicore". #' #' @importFrom dplyr pull #' @importFrom rlang sym #' -#' @return +#' @return A dataframe that combines the original dataframe `df` with lineage +#' information retrieved based on the provided accession numbers. +#' #' @export #' #' @examples +#' \dontrun{ +#' enriched_df <- addLineage(df = my_data, +#' acc_col = "AccNum", +#' assembly_path = "path/to/assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv") +#' } addLineage <- function(df, acc_col = "AccNum", assembly_path, lineagelookup_path, ipgout_path = NULL, plan = "multicore") { acc_sym <- sym(acc_col) @@ -194,12 +224,23 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' (taxid to lineage mapping). This file can be generated using the #' @param ipgout_path Path to write the results of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. Defaults to NULL -#' @param plan +#' @param plan Character. Specifies the execution plan for parallel processing. +#' Default is "multicore". #' -#' @return +#' @return A dataframe containing lineage information mapped to the given protein +#' accessions. The dataframe includes relevant columns such as TaxID, GCA_ID, +#' Protein, Protein Name, Species, and Lineage.
#' @export #' #' @examples +#' \dontrun{ +#' lineage_data <- acc2Lineage( +#' accessions = c("P12345", "Q67890"), +#' assembly_path = "path/to/assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv", +#' ipgout_path = "path/to/output.txt" +#' ) +#' } acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, ipgout_path = NULL, plan = "multicore") { tmp_ipg <- F @@ -235,16 +276,25 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, #' @param accessions Character vector containing the accession numbers to query on #' the ipg database #' @param out_path Path to write the efetch results to -#' @param plan +#' @param plan Character. Specifies the execution plan for parallel processing. +#' Default is "multicore". #' #' @importFrom future future plan #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return +#' @return The function does not return a value but writes the efetch results +#' directly to the specified `out_path`. +#' #' @export #' #' @examples +#' \dontrun{ +#' efetchIPG( +#' accessions = c("P12345", "Q67890", "A12345"), +#' out_path = "path/to/efetch_results.xml" +#' ) +#' } efetchIPG <- function(accessions, out_path, plan = "multicore") { if (length(accessions) > 0) { partition <- function(v, groups) { @@ -305,18 +355,28 @@ efetchIPG <- function(accessions, out_path, plan = "multicore") { #' @param ipg_file Path to the file containing results of an efetch run on the #' ipg database. The protein accession in 'accessions' should be contained in this #' file -#' @param refseq_assembly_path -#' @param genbank_assembly_path +#' @param refseq_assembly_path String. Path to the RefSeq assembly summary file. +#' @param genbank_assembly_path String. Path to the GenBank assembly summary file. #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). 
This file can be generated using the #' "createLineageLookup()" function #' #' @importFrom data.table fread setnames #' -#' @return +#' @return A data table containing protein accessions along with their +#' corresponding TaxIDs and lineage information. #' @export #' #' @examples +#' \dontrun{ +#' lins <- IPG2Lineage( +#' accessions = c("P12345", "Q67890"), +#' ipg_file = "path/to/ipg_results.txt", +#' refseq_assembly_path = "path/to/refseq_assembly_summary.txt", +#' genbank_assembly_path = "path/to/genbank_assembly_summary.txt", +#' lineagelookup_path = "path/to/lineage_lookup.tsv" +#' ) +#' } IPG2Lineage <- function(accessions, ipg_file, refseq_assembly_path, genbank_assembly_path, lineagelookup_path) { @@ -383,16 +443,25 @@ IPG2Lineage <- function(accessions, ipg_file, ######################################### #' addTaxID #' -#' @param data -#' @param acc_col -#' @param version +#' @param data A data frame or data table containing protein accession numbers. +#' @param acc_col A string specifying the column name in `data` that contains +#' the accession numbers. Defaults to "AccNum". +#' @param version A logical indicating whether to remove the last two characters +#' from the accession numbers for TaxID retrieval. Defaults to TRUE. #' #' @importFrom data.table as.data.table #' -#' @return +#' @return A data table that includes the original data along with a new column +#' containing the corresponding TaxIDs. 
#' @export #' #' @examples +#' \dontrun{ +#' # Create a sample data table with accession numbers +#' sample_data <- data.table(AccNum = c("ABC123.1", "XYZ456.1", "LMN789.2")) +#' enriched_data <- addTaxID(sample_data, acc_col = "AccNum", version = TRUE) +#' enriched_data +#' } addTaxID <- function(data, acc_col = "AccNum", version = T) { if (!is.data.table(data)) { data <- as.data.table(data) @@ -421,17 +490,30 @@ addTaxID <- function(data, acc_col = "AccNum", version = T) { ################################## #' proteinAcc2TaxID #' -#' @param accnums -#' @param suffix -#' @param out_path -#' @param return_dt +#' @param accnums A character vector of protein accession numbers to be mapped +#' to TaxIDs. +#' @param suffix A string suffix used to name the output file generated by the +#' script. +#' @param out_path A string specifying the directory where the output file will +#' be saved. +#' @param return_dt A logical indicating whether to return the result as a data +#' table. Defaults to FALSE. If TRUE, the output file is read into a data table +#' and returned. #' #' @importFrom data.table fread #' -#' @return +#' @return If `return_dt` is TRUE, a data table containing the mapping of protein +#' accession numbers to TaxIDs. If FALSE, the function returns NULL. 
#' @export #' #' @examples +#' \dontrun{ +#' # Example accession numbers +#' accessions <- c("ABC123", "XYZ456", "LMN789") +#' tax_data <- proteinAcc2TaxID(accessions, suffix = "example", +#' out_path = "/path/to/output", return_dt = TRUE) +#' tax_data +#' } proteinAcc2TaxID <- function(accnums, suffix, out_path, return_dt = FALSE) { # Write accnums to a file acc_file <- tempfile() @@ -456,18 +538,25 @@ proteinAcc2TaxID <- function(accnums, suffix, out_path, return_dt = FALSE) { #' @description Perform elink to go from protein database to taxonomy database #' and write the resulting file of taxid and lineage to out_path #' -#' @param accessions Character vector containing the accession numbers to query on -#' the ipg database -#' @param out_path Path to write the efetch results to -#' @param plan +#' @param accessions A character vector containing the accession numbers to query +#' in the protein database. +#' @param out_path A string specifying the path where the results of the query +#' will be written. If set to NULL, a temporary directory will be used. +#' @param plan A character string that specifies the execution plan for parallel +#' processing. The default is "multicore". #' #' @importFrom future plan #' @importFrom purrr map #' -#' @return +#' @return This function does not return a value. It writes the results to the +#' specified output path. #' @export #' #' @examples +#' \dontrun{ +#' accessions <- c("ABC123", "XYZ456", "LMN789") +#' proteinAcc2TaxID_old(accessions, out_path = "/path/to/output") +#' } proteinAcc2TaxID_old <- function(accessions, out_path, plan = "multicore") { if (length(accessions) > 0) { partition <- function(v, groups) { diff --git a/R/msa.R b/R/msa.R index 302359e0..8838fdda 100644 --- a/R/msa.R +++ b/R/msa.R @@ -50,12 +50,15 @@ #' @importFrom msa msa msaPrettyPrint #' @importFrom stringr str_replace #' -#' @return +#' @return A PDF file containing the multiple sequence alignment. 
#' @export #' #' @examples #' \dontrun{ -#' createMSA_PDF() +#' createMSA_PDF(fasta_path = "path/to/your/file.fasta", +#' out_path = "path/to/output/alignment.pdf", +#' lowerbound = 10, +#' upperbound = 200) #' } createMSA_PDF <- function(fasta_path, out_path = NULL, lowerbound = NULL, upperbound = NULL) { @@ -187,15 +190,21 @@ createMSA_PDF <- function(fasta_path, out_path = NULL, ## https://github.com/mhahsler/rMSA #' Function to generate MSA using kalign #' -#' @param fa_file -#' @param outfile +#' @param fa_file Character. The path to the input FASTA file containing protein +#' sequences. +#' @param outfile Character. The path to the output file where the alignment +#' will be saved. #' #' @importFrom Biostrings readAAStringSet #' -#' @return +#' @return A list containing the alignment object and the output file path. #' @export #' #' @examples +#' \dontrun{ +#' createMSA_Kalign(fa_file = "path/to/sequences.fasta", +#' outfile = "path/to/alignment.txt") +#' } createMSA_Kalign <- function(fa_file = "", outfile = "") { prot_aa <- readAAStringSet( path = fa_file, diff --git a/R/networks_domarch.R b/R/networks_domarch.R index 9215aa93..ae6fe8be 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -24,13 +24,17 @@ #' A network of domains is returned based on shared domain architectures. #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes and edges are generated. -#' @param domains_of_interest -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage. +#' @param column Name of column containing Domain architecture from which nodes +#' and edges are generated. +#' @param domains_of_interest Character vector specifying domains of interest. +#' @param cutoff Integer. 
Only use domains that occur at or above the cutoff for +#' total counts if cutoff_type is "Total Count". +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color +#' @param query_color Character. Color to represent the queried domain in the +#' network. #' #' @importFrom dplyr across add_row all_of distinct filter mutate pull select #' @importFrom igraph delete_vertices graph_from_edgelist vertex @@ -41,7 +45,7 @@ #' @importFrom tidyr pivot_wider #' @importFrom visNetwork visIgraph visIgraphLayout visNetwork visOptions #' -#' @return +#' @return A network visualization of domain architectures. #' @export #' #' @examples @@ -227,15 +231,20 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes and edges are generated. -#' @param domains_of_interest -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage. +#' @param column Name of column containing Domain architecture from which nodes +#' and edges are generated. +#' @param domains_of_interest Character vector specifying the domains of interest. +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' total counts if cutoff_type is "Total Count". +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Lineage. #' @param layout Character. Layout type to be used for the network. 
Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Color that the nodes of the domains in the domains_of_interest vector are colored -#' @param partner_color Color that the nodes that are not part of the domains_of_interest vector are colored -#' @param border_color +#' @param query_color Color that the nodes of the domains in the +#' domains_of_interest vector are colored +#' @param partner_color Color that the nodes that are not part of the +#' domains_of_interest vector are colored +#' @param border_color Color for the borders of the nodes. #' @param IsDirected Is the network directed? Set to false to eliminate arrows #' #' @importFrom dplyr distinct filter group_by mutate pull select summarize @@ -245,12 +254,12 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c #' @importFrom stringr str_replace_all str_split #' @importFrom visNetwork visEdges visGroups visIgraphLayout visLegend visNetwork visOptions #' -#' @return +#' @return A network visualization of domain architectures. #' @export #' #' @examples #' \dontrun{ -#' createDomainNetwork(pspa) +#' createBinaryDomainNetwork(pspa) #' } createBinaryDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, cutoff = 70, layout = "nice", query_color = adjustcolor("yellow", alpha.f = .5), diff --git a/R/networks_gencontext.R b/R/networks_gencontext.R index ca1ef52d..6ed19b1e 100755 --- a/R/networks_gencontext.R +++ b/R/networks_gencontext.R @@ -17,13 +17,19 @@ #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes and edges are generated. -#' @param domains_of_interest -#' @param cutoff_type Character. Used to determine how data should be filtered. 
Either -#' \itemize{\item "Lineage" to filter domains based off how many lineages the Domain architecture appears in -#' \item "Total Count" to filter off the total amount of times a domain architecture occurs } -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage. +#' @param column Name of column containing Domain architecture from which nodes +#' and edges are generated. +#' @param domains_of_interest Character vector specifying the domains of interest. +#' @param cutoff_type Character. Used to determine how data should be filtered. +#' Either +#' \itemize{\item "Lineage" to filter domains based off how many lineages the +#' Domain architecture appears in +#' \item "Total Count" to filter off the total amount of times a +#' domain architecture occurs } +#' @param cutoff Integer. Only use domains that occur at or above the cutoff +#' for total counts if cutoff_type is "Total Count". +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Lineage. #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} #' @@ -32,12 +38,14 @@ #' @importFrom igraph E graph_from_edgelist layout.auto layout.circle layout_on_grid layout_randomly plot.igraph V #' @importFrom stringr str_replace_all str_split #' -#' @return +#' @return A plot of the domain architecture network. 
#' @export #' #' @examples #' \dontrun{ -#' createUndirectedGenomicContextNetwork(pspa) +#' createUndirectedGenomicContextNetwork(pspa, column = "DomArch", +#' domains_of_interest = c("Domain1", "Domain2"), +#' cutoff_type = "Total Count", cutoff = 10) #' } createUndirectedGenomicContextNetwork <- function(prot, column = "GenContext", domains_of_interest, cutoff_type = "Lineage", cutoff = 1, layout = "grid") { # by domain networks or all, as required. @@ -127,8 +135,10 @@ createUndirectedGenomicContextNetwork <- function(prot, column = "GenContext", d #' #' @param prot A data frame that contains the column 'GenContext'. #' @param domains_of_interest Character vector of domains of interest. -#' @param column Name of column containing Genomic Context from which nodes and edges are generated. -#' @param cutoff Integer. Only use GenContexts that occur at or above the cutoff percentage for total count +#' @param column Name of column containing Genomic Context from which nodes and +#' edges are generated. +#' @param cutoff Integer. Only use GenContexts that occur at or above the cutoff +#' percentage for total count #' @param layout Character. Layout type to be used for the network. Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto" \item "nice"} #' @param directed Is the network directed? @@ -139,12 +149,12 @@ createUndirectedGenomicContextNetwork <- function(prot, column = "GenContext", d #' @importFrom stringr str_replace_all #' @importFrom visNetwork visIgraphLayout visLegend visNetwork visOptions #' -#' @return +#' @return A plot of the genomic context network. 
#' @export #' #' @examples #' \dontrun{ -#' gc_directed_network(pspa, column = "GenContex", cutoff = 55) +#' createGenomicContextNetwork(pspa, column = "GenContext", cutoff = 55) #' } createGenomicContextNetwork <- function(prot, domains_of_interest, column = "GenContext", cutoff = 40, diff --git a/R/plotme.R b/R/plotme.R index 906e85ec..3cfd54f8 100644 --- a/R/plotme.R +++ b/R/plotme.R @@ -44,10 +44,9 @@ plotSunburst <- function(count_data, fill_by_n = FALSE, sort_by_n = FALSE, maxde } -#' @param count_data -#' -#' @param fill_by_n -#' @param sort_by_n +#' @param count_data A data frame containing the data. +#' @param fill_by_n Logical indicating if fill color is based on counts. +#' @param sort_by_n Logical indicating if data should be sorted by counts. #' #' @importFrom plotly plot_ly #' @importFrom purrr exec @@ -68,18 +67,24 @@ plotTreemap <- function(count_data, fill_by_n = FALSE, sort_by_n = FALSE) { #' prepareColumnParams #' -#' @param count_data -#' @param fill_by_n -#' @param sort_by_n +#' @param count_data A data frame containing the data. +#' @param fill_by_n Logical indicating if fill color is based on counts. +#' @param sort_by_n Logical indicating if data should be sorted by counts. #' #' @importFrom assertthat assert_that #' @importFrom dplyr bind_rows mutate #' @importFrom purrr map #' -#' @return +#' @return A data frame of parameters for treemap visualization. #' @export #' #' @examples +#' \dontrun{ +#' count_data <- data.frame(Category = c("A", "B", "C"), +#' n = c(10, 20, 15)) +#' params <- prepareColumnParams(count_data, fill_by_n = TRUE, sort_by_n = FALSE) +#' params +#' } prepareColumnParams <- function(count_data, fill_by_n, sort_by_n) { validateCountDF(count_data) assertthat::assert_that(is.logical(fill_by_n), @@ -116,17 +121,24 @@ prepareColumnParams <- function(count_data, fill_by_n, sort_by_n) { #' prepareSingleColumnParams #' -#' @param df -#' @param col_num -#' @param root +#' @param df A data frame containing the data to be processed.
+#' @param col_num An integer representing the column number to process. +#' @param root A string representing the root node for the treemap. #' #' @importFrom dplyr c_across group_by mutate rowwise select summarise ungroup #' @importFrom stringr str_glue #' -#' @return +#' @return A data frame containing parameters for the specified column for +#' treemap visualization. #' @export #' #' @examples +#' \dontrun{ +#' df <- data.frame(Category = c("A", "A", "B", "B", "C"), +#' n = c(10, 20, 30, 40, 50)) +#' params <- prepareSingleColumnParams(df, col_num = 1, root = "Root") +#' params +#' } prepareSingleColumnParams <- function(df, col_num, root) { @@ -158,15 +170,18 @@ prepareSingleColumnParams <- function(df, } #' validateCountDF #' -#' @param var +#' @param var A data frame to validate; it must be a count data frame (output of dplyr::count). #' #' @importFrom assertthat assert_that has_name #' @importFrom dplyr across mutate #' -#' @return +#' @return A data frame with non-'n' columns converted to character type. #' @export #' #' @examples +#' \dontrun{ +#' validateCountDF(my_data) +#' } validateCountDF <- function(var) { msg <- paste(substitute(var), "must be a count dataframe (output of dplyr::count)") assertthat::assert_that(is.data.frame(var), diff --git a/R/plotting.R b/R/plotting.R index 62bfae74..14e007f3 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -51,20 +51,67 @@ } -#' Shorten Lineage +######################## +## Internal Functions ## +######################## +#' +#' +.LevelReduction <- function(lin, level) { + gt_loc <- str_locate_all(lin, ">")[[1]] + available_levels <- length(gt_loc) / 2 # Since `str_locate_all` returns a matrix + + # Guard against out-of-bounds level requests + if (level > available_levels || level < 1) { + return(lin) + } else { + gt_loc <- gt_loc[level, ][1] %>% as.numeric() + lin <- substring(lin, first = 0, last = (gt_loc - 1)) + return(lin) + } +} + + + +.GetKingdom <- function(lin) { + gt_loc <- str_locate(lin, ">")[, "start"] + if
(is.na(gt_loc)) { + # No '>' in lineage + return(lin) + } else { + kingdom <- substring(lin, first = 0, last = (gt_loc - 1)) + return(kingdom) + } +} + + +#' shortenLineage #' -#' @param data -#' @param colname -#' @param abr_len +#' @description +#' This function abbreviates lineage names by shortening the first part of the +#' string (up to a given delimiter). +#' +#' @param data A data frame that contains a column with lineage names to be +#' shortened. +#' @param colname Character. The name of the column in the data frame containing +#' the lineage strings to be shortened. Default is `"Lineage"`. +#' @param abr_len Integer. The number of characters to retain after the first +#' letter. If set to 1, only the first letter of each segment before the +#' delimiter (`>`) is retained. Default is 1. #' #' @importFrom stringr str_locate +#' @importFrom purrr pmap +#' +#' @return A modified data frame where the specified lineage column has been +#' shortened. #' -#' @return #' @export #' #' @examples #' \dontrun{ -#' shortenLineage() +#' df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia", +#' "Archaea>Euryarchaeota>Thermococci")) +#' shortened_df <- shortenLineage(df, colname = "Lineage", abr_len = 1) +#' shortened_df #' } shortenLineage <- function(data, colname = "Lineage", abr_len = 1) { abbrv <- function(x) { @@ -98,23 +145,29 @@ shortenLineage <- function(data, colname = "Lineage", abr_len = 1) { #' #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). Default is toast_rack.sub -#' @param colname +#' @param colname Column name from query_data: "DomArch.norep", "GenContext.norep", +#' "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep". #' @param cutoff Numeric. Cutoff for word frequency. Default is 90. -#' @param RowsCutoff -#' @param text.scale Allows scaling of axis title, tick lables, and numbers above the intersection size bars. +#' @param RowsCutoff Boolean. 
If TRUE, applies a row cutoff to remove data rows +#' based on a certain condition. Default is FALSE. +#' @param text.scale Allows scaling of axis title, tick labels, and numbers +#' above the intersection size bars. #' text.scale can either take a universal scale in the form of an integer, #' or a vector of specific scales in the format: c(intersection size title, #' intersection size tick labels, set size title, set size tick labels, set names, #' numbers above bars) -#' @param point.size -#' @param line.size +#' @param point.size Numeric. Sets the size of points in the UpSet plot. +#' Default is 2.2. +#' @param line.size Numeric. Sets the line width in the UpSet plot. +#' Default is 0.8. #' #' @importFrom dplyr across distinct filter if_else mutate pull select where #' @importFrom rlang sym #' @importFrom stringr str_detect str_replace_all str_split #' @importFrom UpSetR upset #' -#' @return +#' @return An UpSet plot object. The plot visualizes intersections of sets based +#' on the provided colname in query_data. #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or @@ -263,8 +316,9 @@ plotUpSet <- function(query_data = "toast_rack.sub", #' Default is prot (variable w/ protein data). #' @param colname Column name from query_data: "DomArch.norep", "GenContext.norep", #' "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep". -#' @param cutoff -#' @param RowsCutoff +#' @param cutoff Numeric. Cutoff for word frequency. Default is 90. +#' @param RowsCutoff Boolean. If TRUE, applies a row cutoff to remove data rows +#' based on a certain condition. Default is FALSE. #' @param color Color for the heatmap. One of six options: "default", "magma", "inferno", #' "plasma", "viridis", or "cividis" #' @@ -276,7 +330,7 @@ plotUpSet <- function(query_data = "toast_rack.sub", #' @importFrom viridis scale_fill_viridis #' @importFrom rlang sym #' -#' @return +#' @return A LineageDA plot object.
#' @export #' #' @details @@ -358,7 +412,7 @@ plotLineageDA <- function(query_data = "prot", #' Lineage Plot: Heatmap of Queries vs Lineages #' -#' @authors Janani Ravi, Samuel Chen +#' @author Janani Ravi, Samuel Chen #' @keywords Lineages, Domains, Domain Architectures, GenomicContexts #' @description #' Lineage plot for queries. Heatmap. @@ -366,10 +420,14 @@ plotLineageDA <- function(query_data = "prot", #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). #' Default is prot (variable w/ protein data). -#' @param queries Character Vector containing the queries that will be used for the categories -#' @param colname -#' @param cutoff -#' @param color +#' @param queries Character Vector containing the queries that will be used for +#' the categories. +#' @param colname Character. The column used for filtering based on the `queries`. +#' Default is "ClustName". +#' @param cutoff Numeric. The cutoff value for filtering rows based on their +#' total count. Rows with values below this cutoff are excluded. +#' @param color Character. Defines the color palette used for the heatmap. +#' Default is a red gradient. #' #' @importFrom dplyr arrange desc filter group_by select summarise union #' @importFrom ggplot2 aes aes_string element_rect element_text geom_tile ggplot scale_fill_gradient scale_x_discrete theme theme_minimal @@ -379,7 +437,9 @@ plotLineageDA <- function(query_data = "prot", #' @importFrom tidyr drop_na #' @importFrom viridis scale_fill_viridis #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) showing the +#' relationship between queries and lineages, with the intensity of color +#' representing the count of matching records. 
#' @export #' #' @note @@ -509,7 +569,9 @@ plotLineageQuery <- function(query_data = all, #' @importFrom stringr str_replace_all #' @importFrom tidyr gather #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) of lineage versus +#' the top neighboring domain architectures, with color intensity representing +#' the frequency of occurrences. #' @export #' #' @details @@ -587,15 +649,19 @@ plotLineageNeighbors <- function(query_data = "prot", query = "pspa", #' Lineage Domain Repeats Plot #' -#' @param query_data -#' @param colname +#' @param query_data Data frame containing protein homolog data, including +#' relevant domain architectures and lineages. +#' @param colname Character. The name of the column in query_data that contains +#' domain architectures or other structural information. #' #' @importFrom dplyr across mutate select where #' @importFrom ggplot2 aes element_text geom_tile ggplot scale_fill_gradient scale_x_discrete theme theme_minimal #' @importFrom stringr str_count str_replace_all #' @importFrom tidyr gather #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) of domain repeat +#' counts across different lineages, with color intensity representing the +#' occurrence of domains. #' @export #' #' @examples @@ -679,7 +745,9 @@ plotLineageDomainRepeats <- function(query_data, colname) { #' @importFrom purrr map #' @importFrom stringr str_locate str_locate_all #' -#' @return +#' @return A ggplot object representing a heatmap (tile plot) of domain repeat +#' counts across different lineages, with color intensity representing the +#' occurrence of domains. 
#' @export #' #' @examples @@ -792,25 +860,35 @@ plotLineageHeatmap <- function(prot, domains_of_interest, level = 3, label.size #' Stacked Lineage Plot #' -#' @param prot -#' @param column -#' @param cutoff -#' @param Lineage_col -#' @param xlabel -#' @param reduce_lineage -#' @param label.size -#' @param legend.position -#' @param legend.text.size -#' @param legend.cols -#' @param legend.size -#' @param coord_flip -#' @param legend +#' @param prot Data frame containing protein data including domain architecture +#' and lineage information. +#' @param column Character. The name of the column in prot representing domain +#' architectures (default is "DomArch"). +#' @param cutoff Numeric. A threshold value for filtering domain architectures +#' or protein counts. +#' @param Lineage_col Character. The name of the column representing lineage +#' data (default is "Lineage"). +#' @param xlabel Character. Label for the x-axis +#' (default is "Domain Architecture"). +#' @param reduce_lineage Logical. Whether to shorten lineage names +#' (default is TRUE). +#' @param label.size Numeric. The size of axis text labels (default is 8). +#' @param legend.position Numeric vector. Coordinates for placing the legend +#' (default is c(0.7, 0.4)). +#' @param legend.text.size Numeric. Size of the text in the legend +#' (default is 10). +#' @param legend.cols Numeric. Number of columns in the legend (default is 2). +#' @param legend.size Numeric. Size of the legend keys (default is 0.7). +#' @param coord_flip Logical. Whether to flip the coordinates of the plot +#' (default is TRUE). +#' @param legend Logical. Whether to display the legend (default is TRUE). 
#' #' @importFrom dplyr pull select #' @importFrom ggplot2 aes_string coord_flip element_blank element_line element_rect element_text geom_bar ggplot guides guide_legend scale_fill_manual xlab ylab theme theme_minimal #' @importFrom purrr map #' -#' @return +#' @return A ggplot object representing a stacked bar plot showing the +#' distribution of protein domain architectures across lineages. #' @export #' #' @examples @@ -938,31 +1016,46 @@ plotStackedLineage <- function(prot, column = "DomArch", cutoff, Lineage_col = " #' plotWordCloud3 #' -#' @param data -#' @param size -#' @param minSize -#' @param gridSize -#' @param fontFamily -#' @param fontWeight -#' @param color -#' @param backgroundColor -#' @param minRotation -#' @param maxRotation -#' @param shuffle -#' @param rotateRatio -#' @param shape -#' @param ellipticity -#' @param widgetsize -#' @param figPath -#' @param hoverFunction +#' @param data Data frame or table containing words and their frequencies for +#' the word cloud. +#' @param size Numeric. Scaling factor for word sizes (default is 1). +#' @param minSize Numeric. Minimum font size for the smallest word +#' (default is 0). +#' @param gridSize Numeric. Size of the grid for placing words (default is 0). +#' @param fontFamily Character. Font family to use for the words +#' (default is "Segoe UI"). +#' @param fontWeight Character. Font weight for the words (default is "bold"). +#' @param color Character or vector. Color of the words. Use "random-dark" for +#' random dark colors (default) or specify a color. +#' @param backgroundColor Character. Background color of the word cloud +#' (default is "white"). +#' @param minRotation Numeric. Minimum rotation angle of words in radians +#' (default is -π/4). +#' @param maxRotation Numeric. Maximum rotation angle of words in radians +#' (default is π/4). +#' @param shuffle Logical. Whether to shuffle the words (default is TRUE). +#' @param rotateRatio Numeric. 
Proportion of words that are rotated +#' (default is 0.4). +#' @param shape Character. Shape of the word cloud ("circle" is default, but +#' you can use "cardioid", "star", "triangle", etc.). +#' @param ellipticity Numeric. Degree of ellipticity (default is 0.65). +#' @param widgetsize Numeric vector. Width and height of the widget +#' (default is NULL, which uses default size). +#' @param figPath Character. Path to an image file to use as a mask for the +#' word cloud (optional). +#' @param hoverFunction JS function. JavaScript function to run when hovering +#' over words (optional). #' #' @importFrom base64enc base64encode #' @importFrom htmlwidgets createWidget JS sizingPolicy #' -#' @return +#' @return An HTML widget object displaying a word cloud. #' @export #' #' @examples +#' \dontrun{ +#' wordcloud3(data = your_data, size = 1.5, color = "random-light") +#' } wordcloud3 <- function(data, size = 1, minSize = 0, gridSize = 0, fontFamily = "Segoe UI", fontWeight = "bold", color = "random-dark", backgroundColor = "white", minRotation = -pi / 4, maxRotation = pi / 4, shuffle = TRUE, @@ -1023,16 +1116,20 @@ wordcloud3 <- function(data, size = 1, minSize = 0, gridSize = 0, fontFamily = " #' #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). Default is "prot". -#' @param colname -#' @param cutoff -#' @param UsingRowsCutoff +#' @param colname Character. The name of the column in `query_data` to generate +#' the word cloud from. Default is "DomArch". +#' @param cutoff Numeric. The cutoff value for filtering elements based on their +#' frequency. Default is 70. +#' @param UsingRowsCutoff Logical. Whether to use a row-based cutoff instead of +#' a frequency cutoff. Default is FALSE. 
#' #' @importFrom dplyr filter pull #' @importFrom RColorBrewer brewer.pal #' @importFrom rlang sym #' @importFrom wordcloud wordcloud #' -#' @return +#' @return A word cloud plot showing the frequency of elements from the selected +#' column. #' @export #' #' @details @@ -1103,14 +1200,18 @@ createWordCloudElement <- function(query_data = "prot", #' #' @param query_data Data frame of protein homologs with the usual 11 columns + #' additional word columns (0/1 format). Default is "prot". -#' @param colname -#' @param cutoff -#' @param UsingRowsCutoff +#' @param colname Character. The name of the column in `query_data` to generate +#' the word cloud from. Default is "DomArch". +#' @param cutoff Numeric. The cutoff value for filtering elements based on their +#' frequency. Default is 70. +#' @param UsingRowsCutoff Logical. Whether to use a row-based cutoff instead of +#' a frequency cutoff. Default is FALSE. #' #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A word cloud plot showing the frequency of elements from the selected +#' column. #' @export #' #' @details @@ -1173,16 +1274,23 @@ createWordCloud2Element <- function(query_data = "prot", #### Sunburst ##### #' Lineage Sunburst #' -#' @param prot Data frame containing a lineage column that the sunburst plot will be generated for -#' @param lineage_column String. Name of the lineage column within the data frame. Defaults to "Lineage" -#' @param type String, either "sunburst" or "sund2b". If type is "sunburst", a sunburst plot of the lineage +#' @param prot Data frame containing a lineage column that the sunburst plot +#' will be generated for +#' @param lineage_column String. Name of the lineage column within the +#' data frame. Defaults to "Lineage" +#' @param type String, either "sunburst" or "sund2b". If type is "sunburst", +#' a sunburst plot of the lineage will be rendered. If the type is sund2b, a +#' sund2b plot will be rendered. #' @param levels Integer. Number of levels the sunburst will have.
-#' @param colors -#' @param legendOrder String vector. The order of the legend. If legendOrder is NULL, -#' @param showLegend Boolean. If TRUE, the legend will be enabled when the component first renders. -#' @param maxLevels Integer, the maximum number of levels to display in the sunburst; 5 by default, NULL to disable -#' then the legend will be in the descending order of the top level hierarchy. -#' will be rendered. If the type is sund2b, a sund2b plot will be rendered. +#' @param colors A vector of colors for the sunburst plot. +#' If NULL, default colors are used. +#' @param legendOrder String vector. The order of the legend. If legendOrder +#' is NULL, then the legend will be in the descending order of the top level +#' hierarchy. +#' @param showLegend Boolean. If TRUE, the legend will be enabled when the +#' component first renders. +#' @param maxLevels Integer, the maximum number of levels to display in the +#' sunburst; 5 by default, NULL to disable. #' #' @importFrom d3r d3_nest #' @importFrom dplyr arrange desc group_by_at select summarise @@ -1191,12 +1299,13 @@ createWordCloud2Element <- function(query_data = "prot", #' @importFrom sunburstR sunburst sund2b #' @importFrom tidyr drop_na separate #' -#' @return +#' @return A sunburst or sund2b plot based on the input lineage data. #' @export #' #' @examples #' \dontrun{ -#' plotLineageSunburst() +#' plotLineageSunburst(prot, lineage_column = "Lineage", +#' type = "sunburst", levels = 3) #' } plotLineageSunburst <- function(prot, lineage_column = "Lineage", type = "sunburst", diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index 290a1644..e2a8a39c 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -45,10 +45,12 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") #' @param x Character vector. #' @param y Delimitter. Default is space (" "). #' -#' @return +#' @return A character vector in title case.
#' @export #' #' @examples +#' convert2TitleCase("hello world") +#' convert2TitleCase("this is a test", "_") convert2TitleCase <- function(x, y = " ") { s <- strsplit(x, y)[[1]] paste(toupper(substring(s, 1, 1)), substring(s, 2), @@ -87,7 +89,8 @@ convert2TitleCase <- function(x, y = " ") { #' @importFrom stringr str_sub #' @importFrom tidyr replace_na separate #' -#' @return +#' @return A data frame containing the combined alignment and lineage +#' information. #' @export #' #' @note Please refer to the source code if you have alternate + @@ -188,8 +191,8 @@ addLeaves2Alignment <- function(aln_file = "", #' #' @author Samuel Chen, Janani Ravi #' -#' @description This function adds a new 'Name' column that is comprised of components from -#' Kingdom, Phylum, Genus, and species, as well as the accession +#' @description This function adds a new 'Name' column that is comprised of +#' components from Kingdom, Phylum, Genus, and species, as well as the accession #' #' @param data Data to add name column to #' @param accnum_col Column containing accession numbers @@ -209,6 +212,9 @@ addLeaves2Alignment <- function(aln_file = "", #' @export #' #' @examples +#' \dontrun{ +#' addName(data_frame) +#' } addName <- function(data, accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage", lin_sep = ">", out_col = "Name") { @@ -272,8 +278,8 @@ addName <- function(data, #' Default is 'pspa.txt' #' @param fa_outpath Character. Path to the written fasta file. #' Default is 'NULL' -#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -#' Default is 'FALSE' +#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' sequence per lineage. Default is 'FALSE' #' #' @details The alignment file would need two columns: 1. accession + #' number and 2. alignment. 
The protein homolog accession to lineage mapping + @@ -283,7 +289,9 @@ addName <- function(data, #' #' @importFrom readr write_file #' -#' @return +#' @return Character string containing the Fasta formatted sequences. +#' If `fa_outpath` is specified, the function also writes the sequences to the +#' Fasta file. #' @export #' #' @examples @@ -325,11 +333,11 @@ convertAlignment2FA <- function(aln_file = "", } #' mapAcc2Name -#' +#' #' @description #' Default rename_fasta() replacement function. Maps an accession number to its name #' -#' @param line he line of a fasta file starting with '>' +#' @param line The line of a fasta file starting with '>' #' @param acc2name Data Table containing a column of accession numbers and a name column #' @param acc_col Name of the column containing Accession numbers #' @param name_col Name of the column containing the names that the accession numbers @@ -339,10 +347,18 @@ convertAlignment2FA <- function(aln_file = "", #' @importFrom stringr str_locate #' @importFrom rlang sym #' -#' @return +#' @return Character string. The modified line from the Fasta file header with +#' the name instead of the accession number. #' @export #' #' @examples +#' \dontrun{ +#' acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), +#' Name = c("Species A", "Species B")) +#' line <- ">ACC001 some additional info" +#' mapped_line <- mapAcc2Name(line, acc2name_table) +#' mapped_line # Expected output: ">Species A" +#' } mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { # change to be the name equivalent to an add_names column # Find the first ' ' @@ -368,10 +384,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { #' @importFrom purrr map #' @importFrom readr read_lines write_lines #' -#' @return +#' @return Character vector containing the modified lines of the Fasta file. 
#' @export #' #' @examples +#' \dontrun{ +#' rename_fasta("input.fasta", "output.fasta", +#' replacement_function = map_acc2name, acc2name = acc2name_table) +#' } rename_fasta <- function(fa_path, outpath, replacement_function = map_acc2name, ...) { lines <- read_lines(fa_path) @@ -391,30 +411,36 @@ rename_fasta <- function(fa_path, outpath, ################################ ## generateAllAlignments2FA #' generateAllAlignments2FA -#' +#' #' @description #' Adding Leaves to an alignment file w/ accessions #' -#' @author Janani Ravi #' @keywords alignment, accnum, leaves, lineage, species #' @description Adding Leaves to all alignment files w/ accessions & DAs? #' #' @param aln_path Character. Path to alignment files. #' Default is 'here("data/rawdata_aln/")' -#' @param fa_outpath Character. Path to the written fasta file. -#' Default is 'here("data/alns/")'. -#' @param lin_file Character. Path to file. Master protein file with AccNum & lineages. +#' @param fa_outpath Character. Path to the written fasta file. +#' Default is 'here("data/alns/")'. +#' @param lin_file Character. Path to file. Master protein file with AccNum & +#' lineages. #' Default is 'here("data/rawdata_tsv/all_semiclean.txt")' -#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage. +#' @param reduced Boolean. If TRUE, the fasta file will contain only one +#' sequence per lineage. #' Default is 'FALSE'. #' -#' @details The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. -#' @note Please refer to the source code if you have alternate + file formats and/or column names. -#' #' @importFrom purrr pmap #' @importFrom stringr str_replace_all #' -#' @return +#' @return NULL. The function saves the output FASTA files to the specified +#' directory. +#' +#' @details The alignment files would need two columns separated by spaces: +#' 1. AccNum and 2. alignment.
The protein homolog file should have AccNum, +#' Species, Lineages. +#' @note Please refer to the source code if you have alternate + file formats +#' and/or column names. +#' #' @export #' #' @examples @@ -452,40 +478,43 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), # accessions <- c("P12345","Q9UHC1","O15530","Q14624","P0DTD1") # accessions <- rep("ANY95992.1", 201) -#' acc2fa +#' acc2FA #' #' @description -#' converts protein accession numbers to a fasta format. Resulting +#' converts protein accession numbers to a fasta format. Resulting #' fasta file is written to the outpath. -#' +#' #' @author Samuel Chen, Janani Ravi #' @keywords accnum, fasta #' #' @description -#' acc2fa converts protein accession numbers to a fasta format. +#' acc2FA converts protein accession numbers to a fasta format. #' Resulting fasta file is written to the outpath. #' #' -#' @param accessions Character vector containing protein accession numbers to generate fasta sequences for. -#' Function may not work for vectors of length > 10,000 +#' @param accessions Character vector containing protein accession numbers to +#' generate fasta sequences for. Function may not work for vectors of +#' length > 10,000 #' @param outpath [str]. Location where fasta file should be written to. -#' @param plan +#' @param plan Character. The plan to use for processing. Default is "sequential". #' #' @importFrom Biostrings readAAStringSet #' @importFrom future future plan #' @importFrom purrr map #' @importFrom rentrez entrez_fetch #' -#' @return +#' @return A Fasta file is written to the specified `outpath`. 
#' @export #' #' @examples #' \dontrun{ -#' acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") -#' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2fa(outpath = "entrez.fa") -#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2fa(outpath = "ebi.fa") +#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +#' outpath = "my_proteins.fasta") +#' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") +#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> +#' acc2FA(outpath = "ebi.fa") #' } -acc2fa <- function(accessions, outpath, plan = "sequential") { +acc2FA <- function(accessions, outpath, plan = "sequential") { # validation stopifnot(length(accessions) > 0) @@ -572,14 +601,23 @@ acc2fa <- function(accessions, outpath, plan = "sequential") { #' @importFrom dplyr filter pull #' @importFrom rlang sym #' -#' @return +#' @return A character vector containing representative accession numbers, +#' one for each distinct observation in the specified 'reduced' column. #' @export #' #' @examples +#' \dontrun{ +#' # Example usage with a data frame called `protein_data` +#' representative_accessions <- createRepresentativeAccNum(prot_data = protein_data, +#' reduced = "Lineage", +#' accnum_col = "AccNum") +#' representative_accessions +#' } createRepresentativeAccNum <- function(prot_data, reduced = "Lineage", accnum_col = "AccNum") { - # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column + # Get Unique reduced column and then bind the AccNums back to get one + # AccNum per reduced column reduced_sym <- sym(reduced) accnum_sym <- sym(accnum_col) @@ -613,8 +651,10 @@ createRepresentativeAccNum <- function(prot_data, #' @author Samuel Chen, Janani Ravi #' #' @param fasta_file Path to the FASTA file to be aligned -#' @param tool Type of alignment tool to use.
One of three options: "Muscle", "ClustalO", or "ClustalW" -#' @param outpath Path to write the resulting alignment to as a FASTA file. If NULL, no file is written +#' @param tool Type of alignment tool to use. One of three options: "Muscle", +#' "ClustalO", or "ClustalW" +#' @param outpath Path to write the resulting alignment to as a FASTA file. If +#' NULL, no file is written #' #' @importFrom Biostrings readAAStringSet #' @importFrom msa msaMuscle msaClustalOmega msaClustalW @@ -623,6 +663,12 @@ createRepresentativeAccNum <- function(prot_data, #' @export #' #' @examples +#' \dontrun{ +#' # Example usage +#' aligned_sequences <- alignFasta("path/to/sequences.fasta", +#' tool = "ClustalO", outpath = "path/to/aligned_sequences.fasta") +#' aligned_sequences +#' } alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { fasta <- readAAStringSet(fasta_file) @@ -653,10 +699,15 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { #' @importFrom Biostrings unmasked #' @importFrom readr write_file #' -#' @return +#' @return Character string of the FASTA content that was written to the file. #' @export #' #' @examples +#' \dontrun{ +#' # Example usage +#' alignment <- alignFasta("path/to/sequences.fasta") +#' writeMSA_AA2FA(alignment, "path/to/aligned_sequences.fasta") +#' } writeMSA_AA2FA <- function(alignment, outpath) { l <- length(rownames(alignment)) fasta <- "" @@ -672,15 +723,21 @@ writeMSA_AA2FA <- function(alignment, outpath) { #' getAccNumFromFA #' -#' @param fasta_file +#' @param fasta_file Character. Path to the FASTA file from which +#' accession numbers will be extracted. #' #' @importFrom readr read_file #' @importFrom stringi stri_extract_all_regex #' -#' @return +#' @return A character vector containing the extracted accession numbers. 
#' @export #' #' @examples +#' \dontrun{ +#' # Example usage +#' accnums <- getAccNumFromFA("path/to/sequences.fasta") +#' accnums +#' } getAccNumFromFA <- function(fasta_file) { txt <- read_file(fasta_file) accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]] diff --git a/R/reverse_operons.R b/R/reverse_operons.R index a2570e8d..f250e8c0 100755 --- a/R/reverse_operons.R +++ b/R/reverse_operons.R @@ -3,14 +3,26 @@ # Modified by Janani Ravi and Samuel Chen -#' straightenOperonSeq +#' straightenOperonSeq: Reverse Equalities in Genomic Context #' -#' @param prot +#' @description +#' This function processes the genomic context strings (GenContext) and reverses +#' directional signs based on the presence of an equal sign ("="). +#' +#' @param prot [vector] A vector of genomic context strings to be processed. +#' +#' @return [vector] A vector of the same length as the input, where each genomic +#' element is annotated with either a forward ("->") or reverse ("<-") direction, +#' depending on its position relative to the "=" symbols. #' -#' @return #' @export #' #' @examples +#' # Example input: Genomic context with directional symbols and an asterisk +#' genomic_context <- c("A", "B", "*", "C", "D", "=", "E", "F") +#' straightenOperonSeq(genomic_context) +#' +#' # Output: "A->", "B->", "*", "<-C", "<-D", "=", "E->", "F->" straightenOperonSeq <- function(prot) { w <- prot # $GenContext.orig # was 'x' @@ -57,14 +69,31 @@ straightenOperonSeq <- function(prot) { ## The function to reverse operons -#' reverseOperonSeq +#' reverseOperonSeq: Reverse the Direction of Operons in Genomic Context +#' +#' @description +#' This function processes a genomic context data frame to reverse the direction +#' of operons based on specific patterns in the GenContext column. It handles +#' elements represented by ">" and "<" and restructures the genomic context by +#' flipping the direction of operons while preserving the relationships +#' indicated by "=".
+#' +#' @param prot [data.frame] A data frame containing at least a column named +#' 'GenContext', which represents the genomic contexts that need to be reversed. #' -#' @param prot +#' @return [data.frame] The input data frame with the 'GenContext' column updated +#' to reflect the reversed operons. #' -#' @return #' @export #' #' @examples +#' \dontrun{ +#' # Example genomic context data frame +#' ## Rework example data, does not pass R-CMD Check +#' prot <- data.frame(GenContext = c("A>B", "CI")) +#' reversed_prot <- reverseOperonSeq(prot) +#' reversed_prot +#' } reverseOperonSeq <- function(prot) { gencontext <- prot$GenContext diff --git a/man/GCA2Lineage.Rd b/man/GCA2Lineage.Rd index 9a2a7a30..6b9011c7 100644 --- a/man/GCA2Lineage.Rd +++ b/man/GCA2Lineage.Rd @@ -21,7 +21,13 @@ This file can be generated using the "downloadAssemblySummary()" function} (taxid to lineage mapping). This file can be generated using the "createLineageLookup()" function} -\item{acc_col}{} +\item{acc_col}{Character. The name of the column in \code{prot_data} containing +accession numbers. Default is "AccNum".} +} +\value{ +A dataframe containing the merged information of GCA_IDs, TaxIDs, +and their corresponding lineage up to the phylum level. The dataframe +will include information from the input \code{prot_data} and lineage data. } \description{ Function to map GCA_ID to TaxID, and TaxID to Lineage @@ -29,6 +35,13 @@ Function to map GCA_ID to TaxID, and TaxID to Lineage \note{ Currently configured to have at most kingdom and phylum } +\examples{ +\dontrun{ +result <- GCA2Lineage(prot_data = my_prot_data, + assembly_path = "path/to/assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv") +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/IPG2Lineage.Rd b/man/IPG2Lineage.Rd index 118812ab..eef47027 100644 --- a/man/IPG2Lineage.Rd +++ b/man/IPG2Lineage.Rd @@ -27,6 +27,10 @@ IPG2Lineage( ipg database.
The protein accession in 'accessions' should be contained in this file} +\item{refseq_assembly_path}{String. Path to the RefSeq assembly summary file.} + +\item{genbank_assembly_path}{String. Path to the GenBank assembly summary file.} + \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the "createLineageLookup()" function} @@ -37,6 +41,9 @@ This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} f \value{ A \code{data.table} with the lineage information for the provided protein accessions. + +A data table containing protein accessions along with their +corresponding TaxIDs and lineage information. } \description{ Takes the resulting file @@ -50,6 +57,15 @@ append lineage, and taxid columns IPG2Lineage() } +\dontrun{ +lins <- IPG2Lineage( + accessions = c("P12345", "Q67890"), + ipg_file = "path/to/ipg_results.txt", + refseq_assembly_path = "path/to/refseq_assembly_summary.txt", + genbank_assembly_path = "path/to/genbank_assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv" +) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/acc2FA.Rd b/man/acc2FA.Rd new file mode 100644 index 00000000..ae7101d7 --- /dev/null +++ b/man/acc2FA.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CHANGED-pre-msa-tree.R, R/pre-msa-tree.R +\name{acc2FA} +\alias{acc2FA} +\title{acc2FA} +\usage{ +acc2FA(accessions, outpath, plan = "sequential") + +acc2FA(accessions, outpath, plan = "sequential") +} +\arguments{ +\item{accessions}{Character vector containing protein accession numbers to +generate fasta sequences for. Function may not work for vectors of +length > 10,000} + +\item{outpath}{\link{str}. Location where fasta file should be written to.} + +\item{plan}{Character. The plan to use for processing. 
Default is "sequential".} +} +\value{ +A logical value indicating whether the retrieval and conversion were +successful. Returns \code{TRUE} if successful and \code{FALSE} otherwise. + +A Fasta file is written to the specified \code{outpath}. +} +\description{ +converts protein accession numbers to a fasta format. Resulting +fasta file is written to the outpath. + +converts protein accession numbers to a fasta format. Resulting +fasta file is written to the outpath. + +acc2FA converts protein accession numbers to a fasta format. +Resulting fasta file is written to the outpath. +} +\examples{ +\dontrun{ +acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +outpath = "my_proteins.fasta") +Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") +EBI:accessions <- c("P12345", "Q9UHC1", +"O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa") +} +\dontrun{ +acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), +outpath = "my_proteins.fasta") +Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa") +EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> +acc2FA(outpath = "ebi.fa") +} +} +\author{ +Samuel Chen, Janani Ravi +} +\keyword{accnum,} +\keyword{fasta} diff --git a/man/acc2Lineage.Rd b/man/acc2Lineage.Rd index a46b6f20..ce499592 100644 --- a/man/acc2Lineage.Rd +++ b/man/acc2Lineage.Rd @@ -32,11 +32,16 @@ This file can be generated using the "downloadAssemblySummary()" function} \item{ipgout_path}{Path to write the results of the efetch run of the accessions on the ipg database. If NULL, the file will not be written. Defaults to NULL} -\item{plan}{} +\item{plan}{Character. Specifies the execution plan for parallel processing. +Default is "multicore".} } \value{ A \code{data.table} that contains the lineage information, mapping protein accessions to their tax IDs and lineages. + +A dataframe containing lineage information mapped to the given protein +accessions. 
The dataframe includes relevant columns such as TaxID, GCA_ID, +Protein, Protein Name, Species, and Lineage. } \description{ This function combines 'efetchIPG()' and 'IPG2Lineage()' to map a set @@ -51,6 +56,14 @@ of protein accessions to their assembly (GCA_ID), tax ID, and lineage. \dontrun{ acc2Lineage() } +\dontrun{ +lineage_data <- acc2Lineage( + accessions = c("P12345", "Q67890"), + assembly_path = "path/to/assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv", + ipgout_path = "path/to/output.txt" +) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/acc2fa.Rd b/man/acc2fa.Rd deleted file mode 100644 index 3e7a756d..00000000 --- a/man/acc2fa.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pre-msa-tree.R -\name{acc2fa} -\alias{acc2fa} -\title{acc2fa} -\usage{ -acc2fa(accessions, outpath, plan = "sequential") -} -\arguments{ -\item{accessions}{Character vector containing protein accession numbers to generate fasta sequences for. -Function may not work for vectors of length > 10,000} - -\item{outpath}{\link{str}. Location where fasta file should be written to.} - -\item{plan}{} -} -\description{ -converts protein accession numbers to a fasta format. Resulting -fasta file is written to the outpath. - -acc2fa converts protein accession numbers to a fasta format. -Resulting fasta file is written to the outpath. 
-} -\examples{ -\dontrun{ -acc2fa(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta") -Entrez:accessions <- rep("ANY95992.1", 201) |> acc2fa(outpath = "entrez.fa") -EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2fa(outpath = "ebi.fa") -} -} -\author{ -Samuel Chen, Janani Ravi -} -\keyword{accnum,} -\keyword{fasta} diff --git a/man/addLeaves2Alignment.Rd b/man/addLeaves2Alignment.Rd index d00e6df7..85edc798 100644 --- a/man/addLeaves2Alignment.Rd +++ b/man/addLeaves2Alignment.Rd @@ -28,6 +28,13 @@ Default is 'pspa.txt'} \item{reduced}{Boolean. If TRUE, a reduced data frame will be generated with only one sequence per lineage. Default is FALSE.} } +\value{ +A data frame containing the enriched alignment data with lineage +information. + +A data frame containing the combined alignment and lineage +information. +} \description{ Adding Leaves to an alignment file w/ accessions Genomic Contexts vs Domain Architectures. diff --git a/man/addLineage.Rd b/man/addLineage.Rd index ab02a5ab..e2363463 100644 --- a/man/addLineage.Rd +++ b/man/addLineage.Rd @@ -23,26 +23,30 @@ addLineage( ) } \arguments{ -\item{df}{A \code{data.frame} containing the input data. One column must contain -the accession numbers.} +\item{df}{Dataframe containing accession numbers. The dataframe should +have a column specified by \code{acc_col} that contains these accession numbers.} -\item{acc_col}{A string specifying the column name in \code{df} that holds the -accession numbers. Defaults to \code{"AccNum"}.} +\item{acc_col}{Character. The name of the column in \code{df} containing +accession numbers. Default is "AccNum".} -\item{assembly_path}{A string specifying the path to the \code{assembly_summary.txt} -file. This file contains metadata about assemblies.} +\item{assembly_path}{String. 
The path to the assembly summary file generated +using the \code{downloadAssemblySummary()} function.} -\item{lineagelookup_path}{A string specifying the path to the lineage lookup -file, which contains a mapping from tax IDs to their corresponding lineages.} +\item{lineagelookup_path}{String. The path to the lineage lookup file (taxid +to lineage mapping) generated using the \code{create_lineage_lookup()} function.} -\item{ipgout_path}{(Optional) A string specifying the path where IPG database -fetch results will be saved. If \code{NULL}, the results are not written to a file.} +\item{ipgout_path}{String. Optional path to save intermediate output files. +Default is NULL.} -\item{plan}{} +\item{plan}{Character. Specifies the execution plan for parallel processing. +Default is "multicore".} } \value{ A \code{data.frame} that combines the original \code{df} with the lineage information. + +A dataframe that combines the original dataframe \code{df} with lineage +information retrieved based on the provided accession numbers. 
} \description{ addLineage @@ -53,4 +57,10 @@ addLineage \dontrun{ addLineage() } +\dontrun{ +enriched_df <- addLineage(df = my_data, + acc_col = "AccNum", + assembly_path = "path/to/assembly_summary.txt", + lineagelookup_path = "path/to/lineage_lookup.tsv") +} } diff --git a/man/addName.Rd b/man/addName.Rd index 6f171456..b681f349 100644 --- a/man/addName.Rd +++ b/man/addName.Rd @@ -45,8 +45,21 @@ Original data with a 'Name' column This function adds a new 'Name' column that is comprised of components from Kingdom, Phylum, Genus, and species, as well as the accession -This function adds a new 'Name' column that is comprised of components from -Kingdom, Phylum, Genus, and species, as well as the accession +This function adds a new 'Name' column that is comprised of +components from Kingdom, Phylum, Genus, and species, as well as the accession +} +\examples{ +# Example usage of the addName function +data <- data.frame( + AccNum = c("ACC123", "ACC456"), + Species = c("Homo sapiens", "Mus musculus"), + Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata") +) +enriched_data <- addName(data) +enriched_data +\dontrun{ +addName(data_frame) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/addTaxID.Rd b/man/addTaxID.Rd index d2fe139d..9e68321c 100644 --- a/man/addTaxID.Rd +++ b/man/addTaxID.Rd @@ -7,8 +7,26 @@ addTaxID(data, acc_col = "AccNum", version = T) } \arguments{ -\item{version}{} +\item{data}{A data frame or data table containing protein accession numbers.} + +\item{acc_col}{A string specifying the column name in \code{data} that contains +the accession numbers. Defaults to "AccNum".} + +\item{version}{A logical indicating whether to remove the last two characters +from the accession numbers for TaxID retrieval. Defaults to TRUE.} +} +\value{ +A data table that includes the original data along with a new column +containing the corresponding TaxIDs. 
} \description{ addTaxID } +\examples{ +\dontrun{ +# Create a sample data table with accession numbers +sample_data <- data.table(AccNum = c("ABC123.1", "XYZ456.1", "LMN789.2")) +enriched_data <- addTaxID(sample_data, acc_col = "AccNum", version = TRUE) +enriched_data +} +} diff --git a/man/alignFasta.Rd b/man/alignFasta.Rd index 02a3026b..61e880ab 100644 --- a/man/alignFasta.Rd +++ b/man/alignFasta.Rd @@ -11,9 +11,11 @@ alignFasta(fasta_file, tool = "Muscle", outpath = NULL) \arguments{ \item{fasta_file}{Path to the FASTA file to be aligned} -\item{tool}{Type of alignment tool to use. One of three options: "Muscle", "ClustalO", or "ClustalW"} +\item{tool}{Type of alignment tool to use. One of three options: "Muscle", +"ClustalO", or "ClustalW"} -\item{outpath}{Path to write the resulting alignment to as a FASTA file. If NULL, no file is written} +\item{outpath}{Path to write the resulting alignment to as a FASTA file. If +NULL, no file is written} } \value{ aligned fasta sequence as a MsaAAMultipleAlignment object @@ -25,6 +27,18 @@ Perform a Multiple Sequence Alignment on a FASTA file. Perform a Multiple Sequence Alignment on a FASTA file. } +\examples{ +\dontrun{ +aligned_sequences <- alignFasta("my_sequences.fasta", +tool = "Muscle", outpath = "aligned_output.fasta") +} +\dontrun{ +# Example usage +aligned_sequences <- alignFasta("path/to/sequences.fasta", +tool = "ClustalO", outpath = "path/to/aligned_sequences.fasta") +aligned_sequences +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/cleanDomainArchitecture.Rd b/man/cleanDomainArchitecture.Rd index 887b5388..f12f1083 100644 --- a/man/cleanDomainArchitecture.Rd +++ b/man/cleanDomainArchitecture.Rd @@ -19,21 +19,33 @@ cleanDomainArchitecture( \arguments{ \item{prot}{A data frame containing a 'DomArch' column} +\item{old}{The name of the original column containing domain architecture. +Defaults to "DomArch.orig".} + +\item{new}{The name of the cleaned column to be created. 
Defaults to +"DomArch".} + \item{domains_keep}{A data frame containing the domain names to be retained.} -\item{domains_rename}{A data frame containing the domain names to be replaced in a column 'old' and the +\item{domains_rename}{A data frame containing the domain names to be replaced +in a column 'old' and the corresponding replacement values in a column 'new'.} -\item{condenseRepeatedDomains}{Boolean. If TRUE, repeated domains in 'DomArch' are condensed. Default is TRUE.} +\item{condenseRepeatedDomains}{Boolean. If TRUE, repeated domains in +'DomArch' are condensed. Default is TRUE.} -\item{removeTails}{Boolean. If TRUE, 'ClustName' will be filtered based on domains to keep/remove. Default is FALSE.} +\item{removeTails}{Boolean. If TRUE, 'ClustName' will be filtered based on +domains to keep/remove. Default is FALSE.} -\item{removeEmptyRows}{Boolean. If TRUE, rows with empty/unnecessary values in 'DomArch' are removed. Default is FALSE.} +\item{removeEmptyRows}{Boolean. If TRUE, rows with empty/unnecessary values +in 'DomArch' are removed. Default is FALSE.} -\item{domains_ignore}{A data frame containing the domain names to be removed in a column called 'domains'} +\item{domains_ignore}{A data frame containing the domain names to be removed +in a column called 'domains'} } \value{ -The original data frame is returned with the clean DomArchs column and the old domains in the DomArchs.old column. +The original data frame is returned with the clean DomArchs column +and the old domains in the DomArchs.old column. 
} \description{ Cleanup Domain Architectures @@ -46,6 +58,7 @@ The original data frame is returned with the clean DomArchs column and the old d } \examples{ \dontrun{ -cleanDomainArchitecture(prot, TRUE, FALSE, domains_keep, domains_rename, domains_ignore = NULL) +cleanDomainArchitecture(prot, TRUE, FALSE, +domains_keep, domains_rename, domains_ignore = NULL) } } diff --git a/man/cleanFAHeaders.Rd b/man/cleanFAHeaders.Rd index e9ad9b30..e93d0ca3 100644 --- a/man/cleanFAHeaders.Rd +++ b/man/cleanFAHeaders.Rd @@ -7,7 +7,9 @@ cleanFAHeaders(fasta) } \arguments{ -\item{fasta}{} +\item{fasta}{An \link{XStringSet} object representing the sequences from a +FASTA file. The sequence names (headers) will be adjusted for uniqueness +and sanitized.} } \value{ \link{XStringSet} fasta with adjusted names (headers) diff --git a/man/cleanGeneDescription.Rd b/man/cleanGeneDescription.Rd index f98a25d4..3d106ae6 100644 --- a/man/cleanGeneDescription.Rd +++ b/man/cleanGeneDescription.Rd @@ -7,7 +7,10 @@ cleanGeneDescription(prot, column) } \arguments{ -\item{column}{} +\item{prot}{A data frame containing the gene descriptions.} + +\item{column}{The name of the column from which gene descriptions are pulled +for cleanup.} } \value{ Return trailing period that occurs in GeneDesc column diff --git a/man/cleanLineage.Rd b/man/cleanLineage.Rd index adcea312..071b37d2 100644 --- a/man/cleanLineage.Rd +++ b/man/cleanLineage.Rd @@ -7,10 +7,15 @@ cleanLineage(prot, lins_rename) } \arguments{ -\item{lins_rename}{} +\item{prot}{A data frame containing a 'Lineage' column that needs to be +cleaned up.} + +\item{lins_rename}{A data frame with two columns: 'old' containing terms +to be replaced and 'new' containing the corresponding replacement terms.} } \value{ -Describe return, in detail +The original data frame with the 'Lineage' column updated based on +the provided replacements. 
} \description{ Cleanup Lineage diff --git a/man/cleanSpecies.Rd b/man/cleanSpecies.Rd index 82b5444c..93fc2e05 100644 --- a/man/cleanSpecies.Rd +++ b/man/cleanSpecies.Rd @@ -13,7 +13,7 @@ cleanSpecies(prot, removeEmptyRows = FALSE) Default is false.} } \value{ -Describe return, in detail +The original data frame with Species cleaned. } \description{ Cleanup Species diff --git a/man/combineFiles.Rd b/man/combineFiles.Rd index 3b56b923..81464fa6 100644 --- a/man/combineFiles.Rd +++ b/man/combineFiles.Rd @@ -13,16 +13,34 @@ combineFiles( ) } \arguments{ -\item{inpath}{String of 'master' path where the files reside (recursive=T)} +\item{inpath}{Character. The master directory path where the files reside. +The search is recursive (i.e., it will look in subdirectories as well).} -\item{pattern}{Character vector containing search pattern for files} +\item{pattern}{Character. A search pattern to identify files to be combined. +Default is "*full_analysis.tsv".} -\item{col_names}{Takes logical T/F arguments OR column names vector; -usage similar to col_names parameter in \code{readr::read_delim}} +\item{delim}{Character. The delimiter used in the input files. +Default is tab ("\t").} + +\item{skip}{Integer. The number of lines to skip at the beginning of each file. +Default is 0.} + +\item{col_names}{Logical or character vector. If TRUE, the first row of each file +is treated as column names. Alternatively, a character vector can +be provided to specify custom column names.} +} +\value{ +A data frame containing the combined contents of all matched files. +Each row will include a new column "ByFile" indicating the source file of the data. 
} \description{ Download the combined assembly summaries of genbank and refseq } +\examples{ +\dontrun{ +combined_data <- combineFiles(inpath = "../molevol_data/project_data/phage_defense/") +} +} \author{ Janani Ravi } diff --git a/man/combineFullAnalysis.Rd b/man/combineFullAnalysis.Rd index 35925e86..46a4ba63 100644 --- a/man/combineFullAnalysis.Rd +++ b/man/combineFullAnalysis.Rd @@ -7,8 +7,22 @@ combineFullAnalysis(inpath, ret = FALSE) } \arguments{ -\item{ret}{} +\item{inpath}{Character. The path to the directory containing the +\code{.full_analysis.tsv} files to be combined.} + +\item{ret}{Logical. If TRUE, the function will return the combined data frame. +Default is FALSE, meaning it will only write the file and not return the data.} +} +\value{ +If \code{ret} is TRUE, a data frame containing the combined data from all +input files. If \code{ret} is FALSE, the function writes the combined data to a +TSV file named \code{cln_combined.tsv} in the specified directory and returns NULL. } \description{ Combining full_analysis files } +\examples{ +\dontrun{ +combined_data <- combineFullAnalysis("path/to/full_analysis/files", ret = TRUE) +} +} diff --git a/man/combineIPR.Rd b/man/combineIPR.Rd index 035c4274..6f539e37 100644 --- a/man/combineIPR.Rd +++ b/man/combineIPR.Rd @@ -7,8 +7,22 @@ combineIPR(inpath, ret = FALSE) } \arguments{ -\item{ret}{} +\item{inpath}{Character. The path to the directory containing the +\code{.iprscan_cln.tsv} files to be combined.} + +\item{ret}{Logical. If TRUE, the function will return the combined data frame. +Default is FALSE, meaning it will only write the file and not return the data.} +} +\value{ +If \code{ret} is TRUE, a data frame containing the combined data from all +input files. If \code{ret} is FALSE, the function writes the combined data to a +TSV file named \code{ipr_combined.tsv} in the specified directory and returns NULL. 
} \description{ Combining clean ipr files } +\examples{ +\dontrun{ +combined_data <- combineIPR("path/to/ipr/files", ret = TRUE) +} +} diff --git a/man/condenseRepeatedDomains.Rd b/man/condenseRepeatedDomains.Rd index 3b239129..ee51a544 100644 --- a/man/condenseRepeatedDomains.Rd +++ b/man/condenseRepeatedDomains.Rd @@ -14,7 +14,7 @@ condenseRepeatedDomains(prot, by_column = "DomArch", excluded_prots = c()) \item{excluded_prots}{Vector of strings that condenseRepeatedDomains should not reduce to (s). Defaults to c()} } \value{ -Describe return, in detail +A data frame with condensed repeated domains in the specified column. } \description{ Condense repeated domains diff --git a/man/convert2TitleCase.Rd b/man/convert2TitleCase.Rd index 72619285..4769efea 100644 --- a/man/convert2TitleCase.Rd +++ b/man/convert2TitleCase.Rd @@ -15,12 +15,24 @@ to_titlecase(text, delimitter) \item{y}{Delimitter. Default is space (" ").} } +\value{ +Character vector with the input strings converted to title case. + +A character vector in title case. +} \description{ Translate string to Title Case w/ delimitter. Translate string to Title Case w/ delimitter. Changing case to 'Title Case' } +\examples{ +# Convert a single string to title case +convert2TitleCase("hello world") # Returns "Hello World" + +convert2TitleCase("hello world") +convert2TitleCase("this is a test", "_") +} \seealso{ chartr, toupper, and tolower. diff --git a/man/convertAlignment2FA.Rd b/man/convertAlignment2FA.Rd index 8e9ceb94..528868c0 100644 --- a/man/convertAlignment2FA.Rd +++ b/man/convertAlignment2FA.Rd @@ -30,8 +30,17 @@ Default is 'pspa.txt'} \item{fa_outpath}{Character. Path to the written fasta file. Default is 'NULL'} -\item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. -Default is 'FALSE'} +\item{reduced}{Boolean. If TRUE, the fasta file will contain only one +sequence per lineage. 
Default is 'FALSE'} +} +\value{ +A character string representing the FASTA formatted sequences. +If \code{fa_outpath} is provided, the FASTA will also be saved to the specified +file. + +Character string containing the Fasta formatted sequences. +If \code{fa_outpath} is specified, the function also writes the sequences to the +Fasta file. } \description{ Adding Leaves to an alignment file w/ accessions diff --git a/man/createBinaryDomainNetwork.Rd b/man/createBinaryDomainNetwork.Rd index 4f0bdc5a..e0450e0e 100644 --- a/man/createBinaryDomainNetwork.Rd +++ b/man/createBinaryDomainNetwork.Rd @@ -19,20 +19,32 @@ createBinaryDomainNetwork( \arguments{ \item{prot}{A data frame that contains the column 'DomArch'.} -\item{column}{Name of column containing Domain architecture from which nodes and edges are generated.} +\item{column}{Name of column containing Domain architecture from which nodes +and edges are generated.} -\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage.} +\item{domains_of_interest}{Character vector specifying the domains of interest.} + +\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for +total counts if cutoff_type is "Total Count". +Only use domains that appear in cutoff or greater lineages if cutoff_type is +Lineage.} \item{layout}{Character. Layout type to be used for the network. 
Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto"}} -\item{query_color}{Color that the nodes of the domains in the domains_of_interest vector are colored} +\item{query_color}{Color that the nodes of the domains in the +domains_of_interest vector are colored} + +\item{partner_color}{Color that the nodes that are not part of the +domains_of_interest vector are colored} -\item{partner_color}{Color that the nodes that are not part of the domains_of_interest vector are colored} +\item{border_color}{Color for the borders of the nodes.} \item{IsDirected}{Is the network directed? Set to false to eliminate arrows} } +\value{ +A network visualization of domain architectures. +} \description{ This function creates a domain network from the 'DomArch' column. @@ -42,6 +54,6 @@ A network of domains is returned based on shared domain architectures. } \examples{ \dontrun{ -createDomainNetwork(pspa) +createBinaryDomainNetwork(pspa) } } diff --git a/man/createDomainNetwork.Rd b/man/createDomainNetwork.Rd index 1588af17..de1de9e1 100644 --- a/man/createDomainNetwork.Rd +++ b/man/createDomainNetwork.Rd @@ -16,15 +16,24 @@ createDomainNetwork( \arguments{ \item{prot}{A data frame that contains the column 'DomArch'.} -\item{column}{Name of column containing Domain architecture from which nodes and edges are generated.} +\item{column}{Name of column containing Domain architecture from which nodes +and edges are generated.} -\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage.} +\item{domains_of_interest}{Character vector specifying domains of interest.} + +\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for +total counts if cutoff_type is "Total Count". +Only use domains that appear in cutoff or greater lineages if cutoff_type is +Lineage.} \item{layout}{Character. 
Layout type to be used for the network. Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto"}} -\item{query_color}{} +\item{query_color}{Character. Color to represent the queried domain in the +network.} +} +\value{ +A network visualization of domain architectures. } \description{ This function creates a domain network from the 'DomArch' column. diff --git a/man/createGenomicContextNetwork.Rd b/man/createGenomicContextNetwork.Rd index ac6deb84..7b23700b 100644 --- a/man/createGenomicContextNetwork.Rd +++ b/man/createGenomicContextNetwork.Rd @@ -18,15 +18,20 @@ createGenomicContextNetwork( \item{domains_of_interest}{Character vector of domains of interest.} -\item{column}{Name of column containing Genomic Context from which nodes and edges are generated.} +\item{column}{Name of column containing Genomic Context from which nodes and +edges are generated.} -\item{cutoff}{Integer. Only use GenContexts that occur at or above the cutoff percentage for total count} +\item{cutoff}{Integer. Only use GenContexts that occur at or above the cutoff +percentage for total count} \item{layout}{Character. Layout type to be used for the network. Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto" \item "nice"}} \item{directed}{Is the network directed?} } +\value{ +A plot of the genomic context network. +} \description{ This function creates a Genomic Context network from the 'GenContext' column. @@ -34,6 +39,6 @@ A network of Genomic Context is returned. 
} \examples{ \dontrun{ -gc_directed_network(pspa, column = "GenContex", cutoff = 55) +gc_directed_network(pspa, column = "GenContext", cutoff = 55) } } diff --git a/man/createLineageLookup.Rd b/man/createLineageLookup.Rd index 132019ce..694760e6 100644 --- a/man/createLineageLookup.Rd +++ b/man/createLineageLookup.Rd @@ -11,20 +11,31 @@ createLineageLookup( ) } \arguments{ -\item{lineage_file}{Path to the rankedlineage.dmp file containing taxid's and their -corresponding taxonomic rank. rankedlineage.dmp can be downloaded at +\item{lineage_file}{Path to the rankedlineage.dmp file containing taxid's +and their corresponding taxonomic rank. rankedlineage.dmp can be downloaded at https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/} \item{outfile}{File the resulting lineage lookup table should be written to} -\item{taxonomic_rank}{The upperbound of taxonomic rank that the lineage includes. The lineaege will -include superkingdom>...>taxonomic_rank. +\item{taxonomic_rank}{The upperbound of taxonomic rank that the lineage +includes. The lineage will include superkingdom>...>taxonomic_rank. Choices include: "supperkingdom", "phylum", "class","order", "family", "genus", and "species"} } +\value{ +A tibble containing the tax IDs and their respective lineages up to +the specified taxonomic rank, saved as a tab-separated file. +} \description{ Create a look up table that goes from TaxID, to Lineage } +\examples{ +\dontrun{ +createLineageLookup(lineage_file = "data/rankedlineage.dmp", + outfile = "data/lineage_lookup.tsv", + taxonomic_rank = "family") +} +} \author{ Samuel Chen } diff --git a/man/createMSA_Kalign.Rd b/man/createMSA_Kalign.Rd index 946f04ae..04f975d3 100644 --- a/man/createMSA_Kalign.Rd +++ b/man/createMSA_Kalign.Rd @@ -7,8 +7,21 @@ createMSA_Kalign(fa_file = "", outfile = "") } \arguments{ -\item{outfile}{} +\item{fa_file}{Character. The path to the input FASTA file containing protein +sequences.} + +\item{outfile}{Character. 
The path to the output file where the alignment +will be saved.} +} +\value{ +A list containing the alignment object and the output file path. } \description{ Function to generate MSA using kalign } +\examples{ +\dontrun{ +createMSA_Kalign(fa_file = "path/to/sequences.fasta", + outfile = "path/to/alignment.txt") +} +} diff --git a/man/createMSA_PDF.Rd b/man/createMSA_PDF.Rd index 7cd7516a..cb1e1146 100644 --- a/man/createMSA_PDF.Rd +++ b/man/createMSA_PDF.Rd @@ -23,6 +23,9 @@ Default is NULL. If value is NULL, the entire multiple sequence alignment is pri \item{upperbound}{Numeric. The column that determines the ending location of the MSA. Default is NULL. If value is NULL, the entire multiple sequence alignment is printed.} } +\value{ +A PDF file containing the multiple sequence alignment. +} \description{ Generates a multiple sequence alignment from a fasta file @@ -31,6 +34,9 @@ a pdf } \examples{ \dontrun{ -createMSA_PDF() +createMSA_PDF(fasta_path = "path/to/your/file.fasta", + out_path = "path/to/output/alignment.pdf", + lowerbound = 10, + upperbound = 200) } } diff --git a/man/createRepresentativeAccNum.Rd b/man/createRepresentativeAccNum.Rd index 3bd20522..53902940 100644 --- a/man/createRepresentativeAccNum.Rd +++ b/man/createRepresentativeAccNum.Rd @@ -25,11 +25,31 @@ One accession number will be assigned for each of these observations} \item{accnum_col}{Column from prot_data that contains Accession Numbers} } +\value{ +A character vector containing one Accession number per distinct +observation from the specified reduced column. + +A character vector containing representative accession numbers, +one for each distinct observation in the specified 'reduced' column. 
+} \description{ -Function to generate a vector of one Accession number per distinct observation from 'reduced' column +Function to generate a vector of one Accession number per distinct +observation from 'reduced' column Function to generate a vector of one Accession number per distinct observation from 'reduced' column } +\examples{ +\dontrun{ +createRepresentativeAccNum(prot) +} +\dontrun{ +# Example usage with a data frame called `protein_data` +representative_accessions <- createRepresentativeAccNum(prot_data = protein_data, + reduced = "Lineage", + accnum_col = "AccNum") +representative_accessions +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/createUndirectedGenomicContextNetwork.Rd b/man/createUndirectedGenomicContextNetwork.Rd index b74da141..e875b328 100644 --- a/man/createUndirectedGenomicContextNetwork.Rd +++ b/man/createUndirectedGenomicContextNetwork.Rd @@ -16,18 +16,29 @@ createUndirectedGenomicContextNetwork( \arguments{ \item{prot}{A data frame that contains the column 'DomArch'.} -\item{column}{Name of column containing Domain architecture from which nodes and edges are generated.} +\item{column}{Name of column containing Domain architecture from which nodes +and edges are generated.} -\item{cutoff_type}{Character. Used to determine how data should be filtered. Either -\itemize{\item "Lineage" to filter domains based off how many lineages the Domain architecture appears in -\item "Total Count" to filter off the total amount of times a domain architecture occurs }} +\item{domains_of_interest}{Character vector specifying the domains of interest.} -\item{cutoff}{Integer. Only use domains that occur at or above the cutoff for total counts if cutoff_type is "Total Count". -Only use domains that appear in cutoff or greater lineages if cutoff_type is Lineage.} +\item{cutoff_type}{Character. Used to determine how data should be filtered. 
+Either +\itemize{\item "Lineage" to filter domains based off how many lineages the +Domain architecture appears in +\item "Total Count" to filter off the total amount of times a +domain architecture occurs }} + +\item{cutoff}{Integer. Only use domains that occur at or above the cutoff +for total counts if cutoff_type is "Total Count". +Only use domains that appear in cutoff or greater lineages if cutoff_type is +Lineage.} \item{layout}{Character. Layout type to be used for the network. Options are: \itemize{\item "grid" \item "circle" \item "random" \item "auto"}} } +\value{ +A plot of the domain architecture network. +} \description{ This function creates a domain network from the 'DomArch' column. @@ -35,6 +46,8 @@ A network of domains is returned based on shared domain architectures. } \examples{ \dontrun{ -createUndirectedGenomicContextNetwork(pspa) +createUndirectedGenomicContextNetwork(pspa, column = "DomArch", +domains_of_interest = c("Domain1", "Domain2"), +cutoff_type = "Total Count", cutoff = 10) } } diff --git a/man/createWordCloud2Element.Rd b/man/createWordCloud2Element.Rd index a6279e2f..b1fd827f 100644 --- a/man/createWordCloud2Element.Rd +++ b/man/createWordCloud2Element.Rd @@ -15,7 +15,18 @@ createWordCloud2Element( \item{query_data}{Data frame of protein homologs with the usual 11 columns + additional word columns (0/1 format). Default is "prot".} -\item{UsingRowsCutoff}{} +\item{colname}{Character. The name of the column in \code{query_data} to generate +the word cloud from. Default is "DomArch".} + +\item{cutoff}{Numeric. The cutoff value for filtering elements based on their +frequency. Default is 70.} + +\item{UsingRowsCutoff}{Logical. Whether to use a row-based cutoff instead of +a frequency cutoff. Default is FALSE.} +} +\value{ +A word cloud plot showing the frequency of elements from the selected +column. 
} \description{ Wordclouds for the predominant domains (from DAs) and DAs (from GC) diff --git a/man/createWordCloudElement.Rd b/man/createWordCloudElement.Rd index 7f27ef41..42b32da0 100644 --- a/man/createWordCloudElement.Rd +++ b/man/createWordCloudElement.Rd @@ -15,7 +15,18 @@ createWordCloudElement( \item{query_data}{Data frame of protein homologs with the usual 11 columns + additional word columns (0/1 format). Default is "prot".} -\item{UsingRowsCutoff}{} +\item{colname}{Character. The name of the column in \code{query_data} to generate +the word cloud from. Default is "DomArch".} + +\item{cutoff}{Numeric. The cutoff value for filtering elements based on their +frequency. Default is 70.} + +\item{UsingRowsCutoff}{Logical. Whether to use a row-based cutoff instead of +a frequency cutoff. Default is FALSE.} +} +\value{ +A word cloud plot showing the frequency of elements from the selected +column. } \description{ Wordclouds for the predominant domains (from DAs) and DAs (from GC) diff --git a/man/downloadAssemblySummary.Rd b/man/downloadAssemblySummary.Rd index 636af878..e67aba70 100644 --- a/man/downloadAssemblySummary.Rd +++ b/man/downloadAssemblySummary.Rd @@ -10,13 +10,25 @@ downloadAssemblySummary( ) } \arguments{ -\item{outpath}{String of path where the assembly summary file should be written} +\item{outpath}{String of path where the assembly summary file should be +written} -\item{keep}{Character vector containing which columns should be retained and downloaded} +\item{keep}{Character vector containing which columns should be retained and +downloaded} +} +\value{ +A tab-separated file containing the assembly summary. The function +does not return any value but writes the output directly to the specified file. 
} \description{ Download the combined assembly summaries of genbank and refseq } +\examples{ +\dontrun{ +downloadAssemblySummary(outpath = "assembly_summary.tsv", + keep = c("assembly_accession", "taxid", "organism_name")) +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index 047e2652..e55c342a 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -14,13 +14,17 @@ the ipg database} \item{out_path}{Path to write the efetch results to} -\item{plan}{} +\item{plan}{Character. Specifies the execution plan for parallel processing. +Default is "multicore".} \item{accnums}{Character vector containing the accession numbers to query on the ipg database} } \value{ No return value. The function writes the fetched results to \code{out_path}. + +The function does not return a value but writes the efetch results +directly to the specified \code{out_path}. } \description{ Perform efetch on the ipg database and write the results to out_path @@ -31,6 +35,12 @@ Perform efetch on the ipg database and write the results to out_path \dontrun{ efetchIPG() } +\dontrun{ +efetchIPG( + accessions = c("P12345", "Q67890", "A12345"), + out_path = "path/to/efetch_results.xml" +) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/extractAccNum.Rd b/man/extractAccNum.Rd index 15870f3f..caf9e5db 100644 --- a/man/extractAccNum.Rd +++ b/man/extractAccNum.Rd @@ -7,7 +7,8 @@ extractAccNum(string) } \arguments{ -\item{string}{} +\item{string}{A string from which to extract the accession number. +The string may contain accession information delimited by \code{|} or spaces.} } \value{ Describe return, in detail diff --git a/man/generateAllAlignments2FA.Rd b/man/generateAllAlignments2FA.Rd index 8f9d8ffc..421d8cf7 100644 --- a/man/generateAllAlignments2FA.Rd +++ b/man/generateAllAlignments2FA.Rd @@ -22,15 +22,24 @@ generateAllAlignments2FA( \item{aln_path}{Character. Path to alignment files. 
Default is 'here("data/rawdata_aln/")'} -\item{fa_outpath}{Character. Path to the written fasta file. -Default is 'here("data/alns/")'.} - -\item{lin_file}{Character. Path to file. Master protein file with AccNum & lineages. +\item{lin_file}{Character. Path to file. Master protein file with AccNum & +lineages. Default is 'here("data/rawdata_tsv/all_semiclean.txt")'} -\item{reduced}{Boolean. If TRUE, the fasta file will contain only one sequence per lineage. +\item{fa_outpath}{Character. Path to the written fasta file. +Default is 'here("data/alns/")'.} + +\item{reduced}{Boolean. If TRUE, the fasta file will contain only one +sequence per lineage. Default is 'FALSE'.} } +\value{ +NULL. The function saves the output FASTA files to the specified +directory. + +NULL. The function saves the output FASTA files to the specified +directory. +} \description{ Adding Leaves to an alignment file w/ accessions @@ -41,14 +50,24 @@ Adding Leaves to an alignment file w/ accessions Adding Leaves to all alignment files w/ accessions & DAs? } \details{ -The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. +The alignment files would need two columns separated by spaces: +\enumerate{ +\item AccNum and 2. alignment. The protein homolog file should have AccNum, +Species, Lineages. +} -The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages. +The alignment files would need two columns separated by spaces: +\enumerate{ +\item AccNum and 2. alignment. The protein homolog file should have AccNum, +Species, Lineages. +} } \note{ -Please refer to the source code if you have alternate + file formats and/or column names. +Please refer to the source code if you have alternate + file formats +and/or column names. 
-Please refer to the source code if you have alternate + file formats and/or column names. +Please refer to the source code if you have alternate + file formats +and/or column names. } \examples{ \dontrun{ @@ -58,9 +77,6 @@ generateAllAlignments2FA() generateAllAlignments2FA() } } -\author{ -Janani Ravi -} \keyword{accnum,} \keyword{alignment,} \keyword{leaves,} diff --git a/man/getAccNumFromFA.Rd b/man/getAccNumFromFA.Rd index d3ab8177..4c6179a1 100644 --- a/man/getAccNumFromFA.Rd +++ b/man/getAccNumFromFA.Rd @@ -9,10 +9,26 @@ getAccNumFromFA(fasta_file) getAccNumFromFA(fasta_file) } \arguments{ -\item{fasta_file}{} +\item{fasta_file}{Character. Path to the FASTA file from which +accession numbers will be extracted.} +} +\value{ +A character vector containing the extracted accession numbers. + +A character vector containing the extracted accession numbers. } \description{ getAccNumFromFA getAccNumFromFA } +\examples{ +\dontrun{ +getAccNumFromFA("my_sequences.fasta") +} +\dontrun{ +# Example usage +accnums <- getAccNumFromFA("path/to/sequences.fasta") +accnums +} +} diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd index b8571350..0eeb0610 100644 --- a/man/getTopAccByLinDomArch.Rd +++ b/man/getTopAccByLinDomArch.Rd @@ -13,8 +13,32 @@ getTopAccByLinDomArch( ) } \arguments{ -\item{query}{} +\item{infile_full}{A data frame containing the full dataset with lineage and +domain architecture information.} + +\item{DA_col}{A string representing the name of the domain architecture +column. Default is "DomArch.Pfam".} + +\item{lin_col}{A string representing the name of the lineage column. +Default is "Lineage_short".} + +\item{n}{An integer specifying the number of top accession numbers to return. +Default is 20.} + +\item{query}{A string for filtering a specific query name. 
If it is not +"All", only the data matching this query will be processed.} +} +\value{ +A vector of the top N accession numbers (\code{AccNum}) based on counts +grouped by lineage and domain architecture. } \description{ Group by lineage + DA then take top 20 } +\examples{ +\dontrun{ +top_accessions <- getTopAccByLinDomArch(infile_full = my_data, +DA_col = "DomArch.Pfam", lin_col = "Lineage_short", +n = 20, query = "specific_query_name") +} +} diff --git a/man/mapAcc2Name.Rd b/man/mapAcc2Name.Rd index 39ecb065..3213201a 100644 --- a/man/mapAcc2Name.Rd +++ b/man/mapAcc2Name.Rd @@ -9,7 +9,7 @@ mapAcc2Name(line, acc2name, acc_col = "AccNum", name_col = "Name") mapAcc2Name(line, acc2name, acc_col = "AccNum", name_col = "Name") } \arguments{ -\item{line}{he line of a fasta file starting with '>'} +\item{line}{The line of a fasta file starting with '>'} \item{acc2name}{Data Table containing a column of accession numbers and a name column} @@ -18,8 +18,27 @@ mapAcc2Name(line, acc2name, acc_col = "AccNum", name_col = "Name") \item{name_col}{Name of the column containing the names that the accession numbers are mapped to} } +\value{ +A character string representing the updated FASTA line, where the +accession number is replaced with its corresponding name. + +Character string. The modified line from the Fasta file header with +the name instead of the accession number. +} \description{ Default renameFA() replacement function. Maps an accession number to its name Default rename_fasta() replacement function. 
Maps an accession number to its name } +\examples{ +\dontrun{ +mapAcc2Name(">P12345 some description", acc2name, "AccNum", "Name") +} +\dontrun{ +acc2name_table <- data.table(AccNum = c("ACC001", "ACC002"), +Name = c("Species A", "Species B")) +line <- ">ACC001 some additional info" +mapped_line <- mapAcc2Name(line, acc2name_table) +mapped_line # Expected output: ">Species A" +} +} diff --git a/man/plotIPR2Viz.Rd b/man/plotIPR2Viz.Rd index 7ed420c9..13ac06c1 100644 --- a/man/plotIPR2Viz.Rd +++ b/man/plotIPR2Viz.Rd @@ -17,8 +17,51 @@ plotIPR2Viz( ) } \arguments{ -\item{query}{} +\item{infile_ipr}{A path to the input IPR file (TSV format) containing +domain information.} + +\item{infile_full}{A path to the full input file (TSV format) containing +lineage and accession information.} + +\item{accessions}{A character vector of accession numbers to filter the +analysis. Default is an empty vector.} + +\item{analysis}{A character vector specifying the types of analysis to +include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a +vector of these analyses.} + +\item{group_by}{A string specifying how to group the visualization. +Default is "Analysis". Options include "Analysis" or "Query".} + +\item{topn}{An integer specifying the number of top accessions to visualize. +Default is 20.} + +\item{name}{A string representing the name to use for y-axis labels. +Default is "Name".} + +\item{text_size}{An integer specifying the text size for the plot. +Default is 15.} + +\item{query}{A string for filtering a specific query name. If it is not +"All", only the data matching this query will be processed.} +} +\value{ +A ggplot object representing the domain architecture visualization. 
} \description{ plotIPR2Viz } +\examples{ +\dontrun{ +plot <- plotIPR2Viz(infile_ipr = "path/to/ipr_file.tsv", + infile_full = "path/to/full_file.tsv", + accessions = c("ACC123", "ACC456"), + analysis = c("Pfam", "TMHMM"), + group_by = "Analysis", + topn = 20, + name = "Gene Name", + text_size = 15, + query = "All") +plot +} +} diff --git a/man/plotIPR2VizWeb.Rd b/man/plotIPR2VizWeb.Rd index 3b94a5a7..e56d917e 100644 --- a/man/plotIPR2VizWeb.Rd +++ b/man/plotIPR2VizWeb.Rd @@ -17,8 +17,52 @@ plotIPR2VizWeb( ) } \arguments{ -\item{rows}{} +\item{infile_ipr}{A path to the input IPR file (TSV format) containing +domain information.} + +\item{accessions}{A character vector of accession numbers to filter the +analysis.} + +\item{analysis}{A character vector specifying the types of analysis to +include (e.g., "Pfam", "Phobius", "TMHMM", "Gene3D"). Default is a vector +of these analyses.} + +\item{group_by}{A string specifying how to group the visualization. +Default is "Analysis". Options include "Analysis" or "Query".} + +\item{name}{A string representing the name to use for y-axis labels. +Default is "Name".} + +\item{text_size}{An integer specifying the text size for the plot. +Default is 15.} + +\item{legend_name}{A string representing the column to use for legend labels. +Default is "ShortName".} + +\item{cols}{An integer specifying the number of columns in the facet wrap. +Default is 5.} + +\item{rows}{An integer specifying the number of rows in the legend. +Default is 10.} +} +\value{ +A ggplot object representing the domain architecture visualization +for web display. 
} \description{ plotIPR2VizWeb } +\examples{ +\dontrun{ +plot <- plotIPR2VizWeb(infile_ipr = "path/to/ipr_file.tsv", + accessions = c("ACC123", "ACC456"), + analysis = c("Pfam", "TMHMM"), + group_by = "Analysis", + name = "Gene Name", + text_size = 15, + legend_name = "ShortName", + cols = 5, + rows = 10) +plot +} +} diff --git a/man/plotLineageDA.Rd b/man/plotLineageDA.Rd index 7e84bcfd..a752eb9b 100644 --- a/man/plotLineageDA.Rd +++ b/man/plotLineageDA.Rd @@ -20,9 +20,17 @@ Default is prot (variable w/ protein data).} \item{colname}{Column name from query_data: "DomArch.norep", "GenContext.norep", "DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep".} +\item{cutoff}{Numeric. Cutoff for word frequency. Default is 90.} + +\item{RowsCutoff}{Boolean. If TRUE, applies a row cutoff to remove data rows +based on a certain condition. Default is FALSE.} + \item{color}{Color for the heatmap. One of six options: "default", "magma", "inferno", "plasma", "viridis", or "cividis"} } +\value{ +A LineageDA plot object. +} \description{ Lineage plot for Domains, Domain Architectures and Genomic Contexts. Heatmap. diff --git a/man/plotLineageDomainRepeats.Rd b/man/plotLineageDomainRepeats.Rd index 8ccfba41..45d31d68 100644 --- a/man/plotLineageDomainRepeats.Rd +++ b/man/plotLineageDomainRepeats.Rd @@ -7,7 +7,16 @@ plotLineageDomainRepeats(query_data, colname) } \arguments{ -\item{colname}{} +\item{query_data}{Data frame containing protein homolog data, including +relevant domain architectures and lineages.} + +\item{colname}{Character. The name of the column in query_data that contains +domain architectures or other structural information.} +} +\value{ +A ggplot object representing a heatmap (tile plot) of domain repeat +counts across different lineages, with color intensity representing the +occurrence of domains. 
} \description{ Lineage Domain Repeats Plot diff --git a/man/plotLineageHeatmap.Rd b/man/plotLineageHeatmap.Rd index 5449f8ec..e6870edb 100644 --- a/man/plotLineageHeatmap.Rd +++ b/man/plotLineageHeatmap.Rd @@ -15,6 +15,11 @@ plotLineageHeatmap(prot, domains_of_interest, level = 3, label.size = 8) \item{label.size}{Size of the text labels} } +\value{ +A ggplot object representing a heatmap (tile plot) of domain repeat +counts across different lineages, with color intensity representing the +occurrence of domains. +} \description{ Generate a lineage plot } diff --git a/man/plotLineageNeighbors.Rd b/man/plotLineageNeighbors.Rd index 85adf175..2c7ca448 100644 --- a/man/plotLineageNeighbors.Rd +++ b/man/plotLineageNeighbors.Rd @@ -18,6 +18,11 @@ additional word columns (0/1 format). Default is pspa_data.} \item{colname}{Column name from query_data. Default is "GenContext.norep".} } +\value{ +A ggplot object representing a heatmap (tile plot) of lineage versus +the top neighboring domain architectures, with color intensity representing +the frequency of occurrences. +} \description{ Lineage plot for top neighbors obtained from DAs of Genomic Contexts. diff --git a/man/plotLineageQuery.Rd b/man/plotLineageQuery.Rd index ad52a4d2..aa3793b7 100644 --- a/man/plotLineageQuery.Rd +++ b/man/plotLineageQuery.Rd @@ -17,9 +17,22 @@ plotLineageQuery( additional word columns (0/1 format). Default is prot (variable w/ protein data).} -\item{queries}{Character Vector containing the queries that will be used for the categories} +\item{queries}{Character Vector containing the queries that will be used for +the categories.} -\item{color}{} +\item{colname}{Character. The column used for filtering based on the \code{queries}. +Default is "ClustName".} + +\item{cutoff}{Numeric. The cutoff value for filtering rows based on their +total count. Rows with values below this cutoff are excluded.} + +\item{color}{Character. Defines the color palette used for the heatmap. 
+Default is a red gradient.} +} +\value{ +A ggplot object representing a heatmap (tile plot) showing the +relationship between queries and lineages, with the intensity of color +representing the count of matching records. } \description{ Lineage plot for queries. Heatmap. @@ -33,6 +46,9 @@ column names. plotLineageQuery(prot, c("PspA", "PspB", "PspC", "PspM", "PspN"), 95) } } +\author{ +Janani Ravi, Samuel Chen +} \keyword{Architectures,} \keyword{Domain} \keyword{Domains,} diff --git a/man/plotLineageSunburst.Rd b/man/plotLineageSunburst.Rd index 972bbe5d..363e8c27 100644 --- a/man/plotLineageSunburst.Rd +++ b/man/plotLineageSunburst.Rd @@ -16,27 +16,40 @@ plotLineageSunburst( ) } \arguments{ -\item{prot}{Data frame containing a lineage column that the sunburst plot will be generated for} +\item{prot}{Data frame containing a lineage column that the sunburst plot +will be generated for} -\item{lineage_column}{String. Name of the lineage column within the data frame. Defaults to "Lineage"} +\item{lineage_column}{String. Name of the lineage column within the +data frame. Defaults to "Lineage"} -\item{type}{String, either "sunburst" or "sund2b". If type is "sunburst", a sunburst plot of the lineage} +\item{type}{String, either "sunburst" or "sund2b". If type is "sunburst", +a sunburst plot of the lineage} \item{levels}{Integer. Number of levels the sunburst will have.} -\item{legendOrder}{String vector. The order of the legend. If legendOrder is NULL,} +\item{colors}{A vector of colors for the sunburst plot. +If NULL, default colors are used.} -\item{showLegend}{Boolean. If TRUE, the legend will be enabled when the component first renders.} +\item{legendOrder}{String vector. The order of the legend. If legendOrder +is NULL,} -\item{maxLevels}{Integer, the maximum number of levels to display in the sunburst; 5 by default, NULL to disable -then the legend will be in the descending order of the top level hierarchy. -will be rendered. 
If the type is sund2b, a sund2b plot will be rendered.} +\item{showLegend}{Boolean. If TRUE, the legend will be enabled when the +component first renders.} + +\item{maxLevels}{Integer, the maximum number of levels to display in the +sunburst; 5 by default, NULL to disable then the legend will be in the +descending order of the top level hierarchy. will be rendered. If the type is +sund2b, a sund2b plot will be rendered.} +} +\value{ +A sunburst or sund2b plot based on the input lineage data. } \description{ Lineage Sunburst } \examples{ \dontrun{ -plotLineageSunburst() +plotLineageSunburst(prot, lineage_column = "Lineage", +type = "sunburst", levels = 3) } } diff --git a/man/plotStackedLineage.Rd b/man/plotStackedLineage.Rd index 9d1cde6d..63ae9b66 100644 --- a/man/plotStackedLineage.Rd +++ b/man/plotStackedLineage.Rd @@ -21,7 +21,44 @@ plotStackedLineage( ) } \arguments{ -\item{legend}{} +\item{prot}{Data frame containing protein data including domain architecture +and lineage information.} + +\item{column}{Character. The name of the column in prot representing domain +architectures (default is "DomArch").} + +\item{cutoff}{Numeric. A threshold value for filtering domain architectures +or protein counts.} + +\item{Lineage_col}{Character. The name of the column representing lineage +data (default is "Lineage").} + +\item{xlabel}{Character. Label for the x-axis +(default is "Domain Architecture").} + +\item{reduce_lineage}{Logical. Whether to shorten lineage names +(default is TRUE).} + +\item{label.size}{Numeric. The size of axis text labels (default is 8).} + +\item{legend.position}{Numeric vector. Coordinates for placing the legend +(default is c(0.7, 0.4)).} + +\item{legend.text.size}{Numeric. Size of the text in the legend +(default is 10).} + +\item{legend.cols}{Numeric. Number of columns in the legend (default is 2).} + +\item{legend.size}{Numeric. Size of the legend keys (default is 0.7).} + +\item{coord_flip}{Logical. 
Whether to flip the coordinates of the plot +(default is TRUE).} + +\item{legend}{Logical. Whether to display the legend (default is TRUE).} +} +\value{ +A ggplot object representing a stacked bar plot showing the +distribution of protein domain architectures across lineages. } \description{ Stacked Lineage Plot diff --git a/man/plotSunburst.Rd b/man/plotSunburst.Rd index 5ee465a6..37da9df5 100644 --- a/man/plotSunburst.Rd +++ b/man/plotSunburst.Rd @@ -10,11 +10,11 @@ plotSunburst(count_data, fill_by_n = FALSE, sort_by_n = FALSE, maxdepth = 2) plotTreemap(count_data, fill_by_n = FALSE, sort_by_n = FALSE) } \arguments{ -\item{count_data}{} +\item{count_data}{A data frame containing the data.} -\item{fill_by_n}{If TRUE, uses a continuous scale to fill plot by group size} +\item{fill_by_n}{Logical indicating if fill color is based on counts.} -\item{sort_by_n}{} +\item{sort_by_n}{Logical indicating if data should be sorted by counts.} } \description{ These functions help you quickly create interactive hierarchical plots diff --git a/man/plotUpSet.Rd b/man/plotUpSet.Rd index 84169987..47dd12e1 100644 --- a/man/plotUpSet.Rd +++ b/man/plotUpSet.Rd @@ -18,15 +18,30 @@ plotUpSet( \item{query_data}{Data frame of protein homologs with the usual 11 columns + additional word columns (0/1 format). Default is toast_rack.sub} +\item{colname}{Column name from query_data: "DomArch.norep", "GenContext.norep", +"DomArch.PFAM.norep" or "DomArch.LADB.norep". Default is "DomArch.norep".} + \item{cutoff}{Numeric. Cutoff for word frequency. Default is 90.} -\item{text.scale}{Allows scaling of axis title, tick lables, and numbers above the intersection size bars. +\item{RowsCutoff}{Boolean. If TRUE, applies a row cutoff to remove data rows +based on a certain condition. Default is FALSE.} + +\item{text.scale}{Allows scaling of axis title, tick labels, and numbers +above the intersection size bars.
text.scale can either take a universal scale in the form of an integer, or a vector of specific scales in the format: c(intersection size title, intersection size tick labels, set size title, set size tick labels, set names, numbers above bars)} -\item{line.size}{} +\item{point.size}{Numeric. Sets the size of points in the UpSet plot. +Default is 2.2.} + +\item{line.size}{Numeric. Sets the line width in the UpSet plot. +Default is 0.8.} +} +\value{ +An UpSet plot object. The plot visualizes intersections of sets based +on the provided colname in query_data. } \description{ UpSet plot for Domain Architectures vs Domains and diff --git a/man/prepareColumnParams.Rd b/man/prepareColumnParams.Rd index bb0b9a29..f685624e 100644 --- a/man/prepareColumnParams.Rd +++ b/man/prepareColumnParams.Rd @@ -7,8 +7,23 @@ prepareColumnParams(count_data, fill_by_n, sort_by_n) } \arguments{ -\item{sort_by_n}{} +\item{count_data}{A data frame containing the data.} + +\item{fill_by_n}{Logical indicating if fill color is based on counts.} + +\item{sort_by_n}{Logical indicating if data should be sorted by counts.} +} +\value{ +A data frame of parameters for treemap visualization. } \description{ prepareColumnParams } +\examples{ +\dontrun{ +count_data <- data.frame(Category = c("A", "B", "C"), + n = c(10, 20, 15)) +params <- prepareColumnParams(count_data, fill_by_n = TRUE, sort_by_n = FALSE) +params +} +} diff --git a/man/prepareSingleColumnParams.Rd b/man/prepareSingleColumnParams.Rd index d823852b..0261f9c1 100644 --- a/man/prepareSingleColumnParams.Rd +++ b/man/prepareSingleColumnParams.Rd @@ -7,8 +7,24 @@ prepareSingleColumnParams(df, col_num, root) } \arguments{ -\item{root}{} +\item{df}{A data frame containing the data to be processed.} + +\item{col_num}{An integer representing the column number to process.} + +\item{root}{A string representing the root node for the treemap.} +} +\value{ +A data frame containing parameters for the specified column for +treemap visualization. 
} \description{ prepareSingleColumnParams } +\examples{ +\dontrun{ +df <- data.frame(Category = c("A", "A", "B", "B", "C"), + n = c(10, 20, 30, 40, 50)) +params <- prepareSingleColumnParams(df, col_num = 1, root = "Root") +params +} +} diff --git a/man/proteinAcc2TaxID.Rd b/man/proteinAcc2TaxID.Rd index c0317bba..1ccafe4f 100644 --- a/man/proteinAcc2TaxID.Rd +++ b/man/proteinAcc2TaxID.Rd @@ -7,8 +7,32 @@ proteinAcc2TaxID(accnums, suffix, out_path, return_dt = FALSE) } \arguments{ -\item{return_dt}{} +\item{accnums}{A character vector of protein accession numbers to be mapped +to TaxIDs.} + +\item{suffix}{A string suffix used to name the output file generated by the +script.} + +\item{out_path}{A string specifying the directory where the output file will +be saved.} + +\item{return_dt}{A logical indicating whether to return the result as a data +table. Defaults to FALSE. If TRUE, the output file is read into a data table +and returned.} +} +\value{ +If \code{return_dt} is TRUE, a data table containing the mapping of protein +accession numbers to TaxIDs. If FALSE, the function returns NULL. } \description{ proteinAcc2TaxID } +\examples{ +\dontrun{ +# Example accession numbers +accessions <- c("ABC123", "XYZ456", "LMN789") +tax_data <- proteinAcc2TaxID(accessions, suffix = "example", +out_path = "/path/to/output", return_dt = TRUE) +tax_data +} +} diff --git a/man/proteinAcc2TaxID_old.Rd b/man/proteinAcc2TaxID_old.Rd index 0c2a85ba..fb6cd5a0 100644 --- a/man/proteinAcc2TaxID_old.Rd +++ b/man/proteinAcc2TaxID_old.Rd @@ -7,17 +7,29 @@ proteinAcc2TaxID_old(accessions, out_path, plan = "multicore") } \arguments{ -\item{accessions}{Character vector containing the accession numbers to query on -the ipg database} +\item{accessions}{A character vector containing the accession numbers to query +in the protein database.} -\item{out_path}{Path to write the efetch results to} +\item{out_path}{A string specifying the path where the results of the query +will be written. 
If set to NULL, a temporary directory will be used.} -\item{plan}{} +\item{plan}{A character string that specifies the execution plan for parallel +processing. The default is "multicore".} +} +\value{ +This function does not return a value. It writes the results to the +specified output path. } \description{ Perform elink to go from protein database to taxonomy database and write the resulting file of taxid and lineage to out_path } +\examples{ +\dontrun{ +accessions <- c("ABC123", "XYZ456", "LMN789") +proteinAcc2TaxID_old(accessions, out_path = "/path/to/output") +} +} \author{ Samuel Chen, Janani Ravi } diff --git a/man/removeAsterisks.Rd b/man/removeAsterisks.Rd index 691a7adf..c62b7651 100644 --- a/man/removeAsterisks.Rd +++ b/man/removeAsterisks.Rd @@ -2,15 +2,19 @@ % Please edit documentation in R/cleanup.R \name{removeAsterisks} \alias{removeAsterisks} -\title{Remove Astrk} +\title{Remove Asterisk} \usage{ removeAsterisks(query_data, colname = "GenContext") } \arguments{ -\item{colname}{} +\item{query_data}{A data frame containing the data to be processed.} + +\item{colname}{The name of the column from which asterisks should be removed. +Defaults to "GenContext".} } \value{ -Describe return, in detail +The original data frame with asterisks removed from the specified +column. } \description{ Remove the asterisks from a column of data diff --git a/man/removeEmptyRows.Rd b/man/removeEmptyRows.Rd index 66551810..4e52cc99 100644 --- a/man/removeEmptyRows.Rd +++ b/man/removeEmptyRows.Rd @@ -13,7 +13,8 @@ removeEmptyRows(prot, by_column = "DomArch") Default column is 'DomArch'. Can also take the following as input, 'Species', 'GenContext', 'ClustName'.} } \value{ -Describe return, in detail +A tibble with rows removed where the specified column contains +\code{"-"}, \code{"NA"}, or an empty string. 
} \description{ Remove empty rows by column diff --git a/man/removeTails.Rd b/man/removeTails.Rd index 76d1e18a..0c63e89d 100644 --- a/man/removeTails.Rd +++ b/man/removeTails.Rd @@ -14,7 +14,8 @@ removeTails(prot, by_column = "DomArch", keep_domains = FALSE) \item{keep_domains}{Default is False Keeps tail entries that contain the query domains.} } \value{ -Describe return, in detail +The original data frame with singletons removed from the specified +column. } \description{ Remove tails/singletons diff --git a/man/renameFA.Rd b/man/renameFA.Rd index 7b6fd579..18eca8b9 100644 --- a/man/renameFA.Rd +++ b/man/renameFA.Rd @@ -15,6 +15,15 @@ renameFA(fa_path, outpath, replacement_function = mapAcc2Name, ...) \item{...}{Additional arguments to pass to replacement_function} } +\value{ +A character vector of the modified lines in the FASTA file. +} \description{ Rename the labels of fasta files } +\examples{ +\dontrun{ +renameFA("path/to/input.fasta", +"path/to/output.fasta", mapAcc2Name, acc2name) +} +} diff --git a/man/rename_fasta.Rd b/man/rename_fasta.Rd index 6b4e5dd7..35658437 100644 --- a/man/rename_fasta.Rd +++ b/man/rename_fasta.Rd @@ -15,6 +15,15 @@ rename_fasta(fa_path, outpath, replacement_function = map_acc2name, ...) \item{...}{Additional arguments to pass to replacement_function} } +\value{ +Character vector containing the modified lines of the Fasta file. +} \description{ Rename the labels of fasta files } +\examples{ +\dontrun{ +rename_fasta("input.fasta", "output.fasta", +replacement_function = map_acc2name, acc2name = acc2name_table) +} +} diff --git a/man/replaceQuestionMarks.Rd b/man/replaceQuestionMarks.Rd index 0949568f..8b16992a 100644 --- a/man/replaceQuestionMarks.Rd +++ b/man/replaceQuestionMarks.Rd @@ -12,7 +12,9 @@ replaceQuestionMarks(prot, by_column = "GenContext") \item{by_column}{Column to operate on} } \value{ -Describe return, in detail +The original data frame with the specified column updated. All +consecutive '?' 
characters will be replaced with 'X(s)', and individual '?' +characters will be replaced with 'X'. } \description{ Replace consecutive '?' separated by '->', '<-' or '||' with 'X(s)' diff --git a/man/reverseOperonSeq.Rd b/man/reverseOperonSeq.Rd index d61ec5f2..812d0e89 100644 --- a/man/reverseOperonSeq.Rd +++ b/man/reverseOperonSeq.Rd @@ -2,13 +2,31 @@ % Please edit documentation in R/reverse_operons.R \name{reverseOperonSeq} \alias{reverseOperonSeq} -\title{reverseOperonSeq} +\title{reverseOperonSeq: Reverse the Direction of Operons in Genomic Context} \usage{ reverseOperonSeq(prot) } \arguments{ -\item{prot}{} +\item{prot}{\link{data.frame} A data frame containing at least a column named +'GenContext', which represents the genomic contexts that need to be reversed.} +} +\value{ +\link{data.frame} The input data frame with the 'GenContext' column updated +to reflect the reversed operons. } \description{ -reverseOperonSeq +This function processes a genomic context data frame to reverse the direction +of operons based on specific patterns in the GenContext column. It handles +elements represented by ">" and "<" and restructures the genomic context by +flipping the direction of operons while preserving the relationships +indicated by "=".
+} +\examples{ +\dontrun{ +# Example genomic context data frame +## Rework example data, does not pass R-CMD Check +prot <- data.frame(GenContext = c("A>B", "CI")) +reversed_prot <- reverseOperonSeq(prot) +reversed_prot +} } diff --git a/man/runDeltaBlast.Rd b/man/runDeltaBlast.Rd index 8a32b954..c3384d12 100644 --- a/man/runDeltaBlast.Rd +++ b/man/runDeltaBlast.Rd @@ -5,7 +5,7 @@ \title{Run DELTABLAST to find homologs for proteins of interest} \usage{ runDeltaBlast( - deltablast_path, + runDeltaBlast, db_search_path, db = "refseq", query, @@ -16,12 +16,35 @@ runDeltaBlast( ) } \arguments{ -\item{db_search_path}{Path to the BLAST databases} +\item{db_search_path}{Path to the BLAST databases.} -\item{num_threads}{} +\item{db}{Name of the BLAST database to search against (default is "refseq").} + +\item{query}{Path to the input query file.} + +\item{evalue}{E-value threshold for reporting matches (default is "1e-5").} + +\item{out}{Path to the output file where results will be saved.} + +\item{num_alignments}{Number of alignments to report.} + +\item{num_threads}{Number of threads to use for the search (default is 1).} + +\item{deltablast_path}{Path to the Delta-BLAST executable.} +} +\value{ +This function does not return a value; it outputs results to the +specified file. } \description{ -Run DELTABLAST to find homologs for proteins of interest +This function executes a Delta-BLAST search using the specified parameters +and database. It sets the BLAST database path, runs the Delta-BLAST command +with the given query, and outputs the results. 
+} +\examples{ +\dontrun{ +runDeltaBlast(runDeltaBlast, db_search_path) +} } \author{ Samuel Chen, Janani Ravi diff --git a/man/runIPRScan.Rd b/man/runIPRScan.Rd index 678d8652..f675314d 100644 --- a/man/runIPRScan.Rd +++ b/man/runIPRScan.Rd @@ -7,8 +7,28 @@ runIPRScan(filepath_fasta, filepath_out, appl = c("Pfam", "Gene3D")) } \arguments{ -\item{appl}{} +\item{filepath_fasta}{A string representing the path to the input FASTA file.} + +\item{filepath_out}{A string representing the base path for the output file.} + +\item{appl}{A character vector specifying the InterProScan applications to +use (e.g., "Pfam", "Gene3D"). Default is \code{c("Pfam", "Gene3D")}.} +} +\value{ +A data frame containing the results from the InterProScan output +TSV file. } \description{ -runIPRScan +Run InterProScan on a given FASTA file and save the results to an +output file. +} +\examples{ +\dontrun{ +results <- runIPRScan( + filepath_fasta = "path/to/your_fasta_file.fasta", + filepath_out = "path/to/output_file", + appl = c("Pfam", "Gene3D") +) +results +} } diff --git a/man/runRPSBlast.Rd b/man/runRPSBlast.Rd index 088254ea..47bf7ef3 100644 --- a/man/runRPSBlast.Rd +++ b/man/runRPSBlast.Rd @@ -15,10 +15,31 @@ runRPSBlast( ) } \arguments{ -\item{db_search_path}{Path to the BLAST databases} +\item{rpsblast_path}{Path to the RPS-BLAST executable.} -\item{num_threads}{} +\item{db_search_path}{Path to the BLAST databases.} + +\item{db}{Name of the BLAST database to search against (default is "refseq").} + +\item{query}{Path to the input query file.} + +\item{evalue}{E-value threshold for reporting matches (default is "1e-5").} + +\item{out}{Path to the output file where results will be saved.} + +\item{num_threads}{Number of threads to use for the search (default is 1).} +} +\value{ +This function does not return a value; it outputs results to the +specified file. 
} \description{ -Run RPSBLAST to generate domain architectures for proteins of interest +This function executes an RPS-BLAST search to generate domain architectures +for specified proteins. It sets the BLAST database path, runs the RPS-BLAST +command with the provided query, and outputs the results. +} +\examples{ +\dontrun{ +runRPSBlast(rpsblast_path, db_search_path, query = query, out = out) +} } diff --git a/man/selectLongestDuplicate.Rd b/man/selectLongestDuplicate.Rd index c177d289..bd535455 100644 --- a/man/selectLongestDuplicate.Rd +++ b/man/selectLongestDuplicate.Rd @@ -7,10 +7,15 @@ selectLongestDuplicate(prot, column) } \arguments{ -\item{column}{} +\item{prot}{A data frame containing the data, with at least one column +named 'AccNum' for identification of duplicates.} + +\item{column}{The name of the column from which the longest entry among +duplicates will be selected.} } \value{ -Describe return, in detail +A data frame containing only the longest entries among duplicates +based on the specified column. } \description{ Pick Longer Duplicate diff --git a/man/shortenLineage.Rd b/man/shortenLineage.Rd index f495fb32..7390b254 100644 --- a/man/shortenLineage.Rd +++ b/man/shortenLineage.Rd @@ -2,18 +2,34 @@ % Please edit documentation in R/plotting.R \name{shortenLineage} \alias{shortenLineage} -\title{Shorten Lineage} +\title{shortenLineage} \usage{ shortenLineage(data, colname = "Lineage", abr_len = 1) } \arguments{ -\item{abr_len}{} +\item{data}{A data frame that contains a column with lineage names to be +shortened.} + +\item{colname}{Character. The name of the column in the data frame containing +the lineage strings to be shortened. Default is \code{"Lineage"}.} + +\item{abr_len}{Integer. The number of characters to retain after the first +letter. If set to 1, only the first letter of each segment before the +delimiter (\code{>}) is retained. Default is 1.} +} +\value{ +A modified data frame where the specified lineage column has been +shortened.
} \description{ -Shorten Lineage +This function abbreviates lineage names by shortening the first part of the +string (up to a given delimiter). } \examples{ \dontrun{ -shortenLineage() +df <- data.frame(Lineage = c("Bacteria>Firmicutes>Clostridia", +"Archaea>Euryarchaeota>Thermococci")) +shortened_df <- shortenLineage(df, colname = "Lineage", abr_len = 1) +shortened_df } } diff --git a/man/straightenOperonSeq.Rd b/man/straightenOperonSeq.Rd index fcd0c923..73f1463c 100644 --- a/man/straightenOperonSeq.Rd +++ b/man/straightenOperonSeq.Rd @@ -2,13 +2,26 @@ % Please edit documentation in R/reverse_operons.R \name{straightenOperonSeq} \alias{straightenOperonSeq} -\title{straightenOperonSeq} +\title{straightenOperonSeq: Reverse Equalities in Genomic Context} \usage{ straightenOperonSeq(prot) } \arguments{ -\item{prot}{} +\item{prot}{\link{vector} A vector of genomic context strings to be processed.} +} +\value{ +\link{vector} A vector of the same length as the input, where each genomic +element is annotated with either a forward ("->") or reverse ("<-") direction, +depending on its position relative to the "=" symbols. } \description{ -straightenOperonSeq +This function processes the genomic context strings (GenContext) and reverses +directional signs based on the presence of an equal sign ("="). +} +\examples{ +# Example input: Genomic context with directional symbols and an asterisk +genomic_context <- c("A", "B", "*", "C", "D", "=", "E", "F") +straightenOperonSeq(genomic_context) + +# Output: "A->", "B->", "*", "<-C", "<-D", "=", "E->", "F->" } diff --git a/man/themeGenes2.Rd b/man/themeGenes2.Rd index 64ae9273..739227b0 100644 --- a/man/themeGenes2.Rd +++ b/man/themeGenes2.Rd @@ -6,6 +6,19 @@ \usage{ themeGenes2() } +\value{ +A ggplot2 theme object. 
+} \description{ themeGenes2 } +\examples{ +library(ggplot2) + +# Create a sample plot using the custom theme +ggplot(mtcars, aes(x = wt, y = mpg)) + + geom_point() + + themeGenes2() + + labs(title = "Car Weight vs MPG") + +} diff --git a/man/validateCountDF.Rd b/man/validateCountDF.Rd index fc4aefa2..5943723e 100644 --- a/man/validateCountDF.Rd +++ b/man/validateCountDF.Rd @@ -7,8 +7,16 @@ validateCountDF(var) } \arguments{ -\item{var}{} +\item{var}{A data frame whose columns are to be converted.} +} +\value{ +A data frame with non-'n' columns converted to character type. } \description{ validateCountDF } +\examples{ +\dontrun{ +new_df <- .all_non_n_cols_to_char(my_data) +} +} diff --git a/man/wordcloud3.Rd b/man/wordcloud3.Rd index cce07a82..1406ea0d 100644 --- a/man/wordcloud3.Rd +++ b/man/wordcloud3.Rd @@ -25,8 +25,60 @@ wordcloud3( ) } \arguments{ -\item{hoverFunction}{} +\item{data}{Data frame or table containing words and their frequencies for +the word cloud.} + +\item{size}{Numeric. Scaling factor for word sizes (default is 1).} + +\item{minSize}{Numeric. Minimum font size for the smallest word +(default is 0).} + +\item{gridSize}{Numeric. Size of the grid for placing words (default is 0).} + +\item{fontFamily}{Character. Font family to use for the words +(default is "Segoe UI").} + +\item{fontWeight}{Character. Font weight for the words (default is "bold").} + +\item{color}{Character or vector. Color of the words. Use "random-dark" for +random dark colors (default) or specify a color.} + +\item{backgroundColor}{Character. Background color of the word cloud +(default is "white").} + +\item{minRotation}{Numeric. Minimum rotation angle of words in radians +(default is -π/4).} + +\item{maxRotation}{Numeric. Maximum rotation angle of words in radians +(default is π/4).} + +\item{shuffle}{Logical. Whether to shuffle the words (default is TRUE).} + +\item{rotateRatio}{Numeric. Proportion of words that are rotated +(default is 0.4).} + +\item{shape}{Character. 
Shape of the word cloud ("circle" is default, but +you can use "cardioid", "star", "triangle", etc.).} + +\item{ellipticity}{Numeric. Degree of ellipticity (default is 0.65).} + +\item{widgetsize}{Numeric vector. Width and height of the widget +(default is NULL, which uses default size).} + +\item{figPath}{Character. Path to an image file to use as a mask for the +word cloud (optional).} + +\item{hoverFunction}{JS function. JavaScript function to run when hovering +over words (optional).} +} +\value{ +An HTML widget object displaying a word cloud. } \description{ plotWordCloud3 } +\examples{ +\dontrun{ +wordcloud3(data = your_data, size = 1.5, color = "random-light") +} +} diff --git a/man/writeMSA_AA2FA.Rd b/man/writeMSA_AA2FA.Rd index a6798469..d0d5d305 100644 --- a/man/writeMSA_AA2FA.Rd +++ b/man/writeMSA_AA2FA.Rd @@ -13,6 +13,11 @@ writeMSA_AA2FA(alignment, outpath) \item{outpath}{Where the resulting FASTA file should be written to} } +\value{ +Character string representing the content of the written FASTA file. + +Character string of the FASTA content that was written to the file. +} \description{ MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega and msaMuscle from the 'msa' package @@ -21,6 +26,16 @@ Write MsaAAMultpleAlignment Objects as aligned fasta sequence MsaAAMultipleAlignment Objects are generated from calls to msaClustalOmega and msaMuscle from the 'msa' package } +\examples{ +\dontrun{ +writeMSA_AA2FA("my_sequences.fasta", outpath = "aligned_output.fasta") +} +\dontrun{ +# Example usage +alignment <- alignFasta("path/to/sequences.fasta") +writeMSA_AA2FA(alignment, "path/to/aligned_sequences.fasta") +} +} \author{ Samuel Chen, Janani Ravi }