Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ export(GCA2Lineage)
export(IPG2Lineage)
export(acc2FA)
export(acc2Lineage)
export(acc2fa)
export(addLeaves2Alignment)
export(addLineage)
export(addName)
Expand Down
96 changes: 71 additions & 25 deletions R/CHANGED-pre-msa-tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,14 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE")
#' @param y Delimiter. Default is space (" ").
#' @seealso chartr, toupper, and tolower.
#'
#' @return
#' @return Character vector with the input strings converted to title case.
#'
#' @export
#'
#' @examples
#' # Convert a single string to title case
#' convert2TitleCase("hello world") # Returns "Hello World"
#'
convert2TitleCase <- function(x, y = " ") {
s <- strsplit(x, y)[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
Expand Down Expand Up @@ -76,7 +80,8 @@ convert2TitleCase <- function(x, y = " ") {
#' @importFrom stringr str_sub
#' @importFrom tidyr replace_na separate
#'
#' @return
#' @return A data frame containing the enriched alignment data with lineage
#' information.
#'
#' @details The alignment file would need two columns: 1. accession +
#' number and 2. alignment. The protein homolog accession to lineage mapping +
Expand Down Expand Up @@ -203,6 +208,14 @@ addLeaves2Alignment <- function(aln_file = "",
#' @export
#'
#' @examples
#' # Example usage of the addName function
#' data <- data.frame(
#' AccNum = c("ACC123", "ACC456"),
#' Species = c("Homo sapiens", "Mus musculus"),
#' Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata")
#' )
#' enriched_data <- addName(data)
#' print(enriched_data)
Comment thread
awasyn marked this conversation as resolved.
Outdated
addName <- function(data,
accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage",
lin_sep = ">", out_col = "Name") {
Expand Down Expand Up @@ -278,7 +291,9 @@ addName <- function(data,
#' @note Please refer to the source code if you have alternate
#' file formats and/or column names.
#'
#' @return
#' @return A character string representing the FASTA formatted sequences.
#' If `fa_outpath` is provided, the FASTA will also be saved to the specified
#' file.
#' @export
#'
#' @examples
Expand Down Expand Up @@ -326,18 +341,24 @@ convertAlignment2FA <- function(aln_file = "",
#' Default renameFA() replacement function. Maps an accession number to its name
#'
#' @param line The line of a fasta file starting with '>'
#' @param acc2name Data Table containing a column of accession numbers and a name column
#' @param acc2name Data Table containing a column of accession numbers and a
#' name column
#' @param acc_col Name of the column containing Accession numbers
#' @param name_col Name of the column containing the names that the accession numbers
#' @param name_col Name of the column containing the names that the accession
#' numbers
#' are mapped to
#'
#' @importFrom dplyr filter pull
#' @importFrom rlang sym
#'
#' @return
#' @return A character string representing the updated FASTA line, where the
#' accession number is replaced with its corresponding name.
#' @export
#'
#' @examples
#' \dontrun{
#' mapAcc2Name(">P12345 some description", acc2name, "AccNum", "Name")
#' }
mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
# change to be the name equivalent to an addNames column
# Find the first ' '
Expand All @@ -363,10 +384,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
#' @importFrom purrr map
#' @importFrom readr read_lines write_lines
#'
#' @return
#' @return A character vector of the modified lines in the FASTA file.
#' @export
#'
#' @examples
#' \dontrun{
#' renameFA("path/to/input.fasta",
#' "path/to/output.fasta", mapAcc2Name, acc2name)
#' }
renameFA <- function(fa_path, outpath,
replacement_function = mapAcc2Name, ...) {
lines <- read_lines(fa_path)
Expand Down Expand Up @@ -395,20 +420,26 @@ renameFA <- function(fa_path, outpath,
#'
#' @param aln_path Character. Path to alignment files.
#' Default is 'here("data/rawdata_aln/")'
#' @param fa_outpath Character. Path to file. Master protein file with AccNum & lineages.
#' @param fa_outpath Character. Path to file. Master protein file with AccNum &
#' lineages.
#' Default is 'here("data/rawdata_tsv/all_semiclean.txt")'
#' @param lin_file Character. Path to the written fasta file.
#' Default is 'here("data/alns/")'.
#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage.
#' @param reduced Boolean. If TRUE, the fasta file will contain only one
#' sequence per lineage.
#' Default is 'FALSE'.
#'
#' @importFrom purrr pmap
#' @importFrom stringr str_replace_all
#'
#' @return
#' @return NULL. The function saves the output FASTA files to the specified
#' directory.
#'
#' @details The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages.
#' @note Please refer to the source code if you have alternate + file formats and/or column names.
#' @details The alignment files would need two columns separated by spaces:
#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum,
#' Species, Lineages.
#' @note Please refer to the source code if you have alternate file formats
#' and/or column names.
#'
#' @export
#'
Expand Down Expand Up @@ -456,24 +487,29 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
#' @author Samuel Chen, Janani Ravi
#' @keywords accnum, fasta
#'
#' @param accessions Character vector containing protein accession numbers to generate fasta sequences for.
#' @param accessions Character vector containing protein accession numbers to
#' generate fasta sequences for.
#' Function may not work for vectors of length > 10,000
#' @param outpath [str] Location where fasta file should be written to.
#' @param plan
#' @param plan Character string specifying the parallel processing strategy to
#' use with the `future` package. Default is "sequential".
#'
#' @importFrom Biostrings readAAStringSet
#' @importFrom future future plan value
#' @importFrom purrr map
#' @importFrom rentrez entrez_fetch
#'
#' @return
#' @return A logical value indicating whether the retrieval and conversion were
#' successful. Returns `TRUE` if successful and `FALSE` otherwise.
#' @export
#'
#' @examples
#' \dontrun{
#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta")
#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"),
#' outpath = "my_proteins.fasta")
#' # Entrez
#' accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa")
#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa")
#' # EBI
#' accessions <- c("P12345", "Q9UHC1",
#' "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa")
#' }
acc2FA <- function(accessions, outpath, plan = "sequential") {
# validation
Expand Down Expand Up @@ -549,7 +585,8 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
#' createRepresentativeAccNum
#'
#' @description
#' Function to generate a vector of one Accession number per distinct observation from 'reduced' column
#' Function to generate a vector of one Accession number per distinct
#' observation from 'reduced' column
#'
#' @author Samuel Chen, Janani Ravi
#'
Expand All @@ -562,14 +599,16 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
#' @importFrom dplyr filter pull
#' @importFrom rlang sym
#'
#' @return
#' @return A character vector containing one Accession number per distinct
#' observation from the specified reduced column.
#' @export
#'
#' @examples
createRepresentativeAccNum <- function(prot_data,
reduced = "Lineage",
accnum_col = "AccNum") {
# Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column
# Get Unique reduced column and then bind the AccNums back to get one
# AccNum per reduced column
reduced_sym <- sym(reduced)
accnum_sym <- sym(accnum_col)

Expand Down Expand Up @@ -603,8 +642,10 @@ createRepresentativeAccNum <- function(prot_data,
#' @author Samuel Chen, Janani Ravi
#'
#' @param fasta_file Path to the FASTA file to be aligned
#' @param tool Type of alignment tool to use. One of three options: "Muscle", "ClustalO", or "ClustalW"
#' @param outpath Path to write the resulting alignment to as a FASTA file. If NULL, no file is written
#' @param tool Type of alignment tool to use. One of three options: "Muscle",
#' "ClustalO", or "ClustalW"
#' @param outpath Path to write the resulting alignment to as a FASTA file.
#' If NULL, no file is written
#'
#' @importFrom Biostrings readAAStringSet
#' @importFrom msa msaClustalOmega msaMuscle msaClustalW
Expand All @@ -613,6 +654,10 @@ createRepresentativeAccNum <- function(prot_data,
#' @export
#'
#' @examples
#' \dontrun{
#' aligned_sequences <- alignFasta("my_sequences.fasta",
#' tool = "Muscle", outpath = "aligned_output.fasta")
#' }
alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
fasta <- readAAStringSet(fasta_file)

Expand Down Expand Up @@ -641,7 +686,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
#' @importFrom Biostrings toString unmasked
#' @importFrom readr write_file
#'
#' @return
#' @return Character string representing the content of the written FASTA file.
#' @export
#'
#' @examples
Expand All @@ -660,11 +705,12 @@ writeMSA_AA2FA <- function(alignment, outpath) {

#' getAccNumFromFA
#'
#' @param fasta_file
#' @param fasta_file Character. The path to the FASTA file from which
#' accession numbers will be extracted.
#'
#' @importFrom stringi stri_extract_all_regex
#'
#' @return
#' @return A character vector containing the extracted accession numbers.
#' @export
#'
#' @examples
Expand Down
48 changes: 31 additions & 17 deletions R/blastWrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,22 @@
#' Run DELTABLAST to find homologs for proteins of interest
#'
#' @author Samuel Chen, Janani Ravi
#' @description
#' This function executes a Delta-BLAST search using the specified parameters
#' and database. It sets the BLAST database path, runs the Delta-BLAST command
#' with the given query, and outputs the results.
#'
#' @param deltablast_path
#' @param db_search_path Path to the BLAST databases
#' @param db
#' @param query
#' @param evalue
#' @param out
#' @param num_alignments
#' @param num_threads
#' @param deltablast_path Path to the Delta-BLAST executable.
#' @param db_search_path Path to the BLAST databases.
#' @param db Name of the BLAST database to search against (default is "refseq").
#' @param query Path to the input query file.
#' @param evalue E-value threshold for reporting matches (default is "1e-5").
#' @param out Path to the output file where results will be saved.
#' @param num_alignments Number of alignments to report.
#' @param num_threads Number of threads to use for the search (default is 1).
#'
#' @return
#' @return This function does not return a value; it outputs results to the
#' specified file.
#' @export
#'
#' @examples
Expand Down Expand Up @@ -43,18 +48,27 @@ runDeltaBlast <- function(deltablast_path, db_search_path,

#' Run RPSBLAST to generate domain architectures for proteins of interest
#'
#' @param rpsblast_path
#' @param db_search_path Path to the BLAST databases
#' @param db
#' @param query
#' @param evalue
#' @param out
#' @param num_threads
#' @description
#' This function executes an RPS-BLAST search to generate domain architectures
#' for specified proteins. It sets the BLAST database path, runs the RPS-BLAST
#' command with the provided query, and outputs the results.
#'
#' @return
#' @param rpsblast_path Path to the RPS-BLAST executable.
#' @param db_search_path Path to the BLAST databases.
#' @param db Name of the BLAST database to search against (default is "refseq").
#' @param query Path to the input query file.
#' @param evalue E-value threshold for reporting matches (default is "1e-5").
#' @param out Path to the output file where results will be saved.
#' @param num_threads Number of threads to use for the search (default is 1).
#'
#' @return This function does not return a value; it outputs results to the
#' specified file.
#' @export
#'
#' @examples
#' \dontrun{
#' runRPSBlast(rpsblast_path, db_search_path, query = query, out = out)
#' }
runRPSBlast <- function(rpsblast_path, db_search_path,
db = "refseq", query, evalue = "1e-5",
out, num_threads = 1) {
Expand Down
Loading