Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ export(GCA2Lineage)
export(IPG2Lineage)
export(acc2FA)
export(acc2Lineage)
export(acc2fa)
export(addLeaves2Alignment)
export(addLineage)
export(addName)
Expand Down
121 changes: 88 additions & 33 deletions R/CHANGED-pre-msa-tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,14 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE")
#' @param y Delimiter. Default is space (" ").
#' @seealso chartr, toupper, and tolower.
#'
#' @return
#' @return Character vector with the input strings converted to title case.
#'
#' @export
#'
#' @examples
#' # Convert a single string to title case
#' convert2TitleCase("hello world") # Returns "Hello World"
#'
convert2TitleCase <- function(x, y = " ") {
s <- strsplit(x, y)[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
Expand Down Expand Up @@ -76,7 +80,8 @@ convert2TitleCase <- function(x, y = " ") {
#' @importFrom stringr str_sub
#' @importFrom tidyr replace_na separate
#'
#' @return
#' @return A data frame containing the enriched alignment data with lineage
#' information.
#'
#' @details The alignment file would need two columns: 1. accession
#' number and 2. alignment. The protein homolog accession to lineage mapping
Expand Down Expand Up @@ -203,6 +208,14 @@ addLeaves2Alignment <- function(aln_file = "",
#' @export
#'
#' @examples
#' # Example usage of the addName function
#' data <- data.frame(
#' AccNum = c("ACC123", "ACC456"),
#' Species = c("Homo sapiens", "Mus musculus"),
#' Lineage = c("Eukaryota>Chordata", "Eukaryota>Chordata")
#' )
#' enriched_data <- addName(data)
#' enriched_data
addName <- function(data,
accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage",
lin_sep = ">", out_col = "Name") {
Expand Down Expand Up @@ -278,7 +291,9 @@ addName <- function(data,
#' @note Please refer to the source code if you have alternate
#' file formats and/or column names.
#'
#' @return
#' @return A character string representing the FASTA formatted sequences.
#' If `fa_outpath` is provided, the FASTA will also be saved to the specified
#' file.
#' @export
#'
#' @examples
Expand Down Expand Up @@ -321,23 +336,29 @@ convertAlignment2FA <- function(aln_file = "",
}

#' mapAcc2Name
#'
#'
#' @description
#' Default renameFA() replacement function. Maps an accession number to its name
#'
#' @param line The line of a fasta file starting with '>'
#' @param acc2name Data Table containing a column of accession numbers and a name column
#' @param acc2name Data Table containing a column of accession numbers and a
#' name column
#' @param acc_col Name of the column containing Accession numbers
#' @param name_col Name of the column containing the names that the accession numbers
#' @param name_col Name of the column containing the names that the accession
#' numbers
#' are mapped to
#'
#' @importFrom dplyr filter pull
#' @importFrom rlang sym
#'
#' @return
#' @return A character string representing the updated FASTA line, where the
#' accession number is replaced with its corresponding name.
#' @export
#'
#' @examples
#' \dontrun{
#' mapAcc2Name(">P12345 some description", acc2name, "AccNum", "Name")
#' }
mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
# change to be the name equivalent to an addNames column
# Find the first ' '
Expand All @@ -363,10 +384,14 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
#' @importFrom purrr map
#' @importFrom readr read_lines write_lines
#'
#' @return
#' @return A character vector of the modified lines in the FASTA file.
#' @export
#'
#' @examples
#' \dontrun{
#' renameFA("path/to/input.fasta",
#' "path/to/output.fasta", mapAcc2Name, acc2name)
#' }
renameFA <- function(fa_path, outpath,
replacement_function = mapAcc2Name, ...) {
lines <- read_lines(fa_path)
Expand All @@ -386,29 +411,35 @@ renameFA <- function(fa_path, outpath,
################################
## generateAllAlignments2FA
#' generateAllAlignments2FA
#'
#' @description
#'
#' @description
#' Adding Leaves to an alignment file w/ accessions
#'
#' @keywords alignment, accnum, leaves, lineage, species
#' @description Adding Leaves to all alignment files w/ accessions & DAs?
#'
#' @param aln_path Character. Path to alignment files.
#' Default is 'here("data/rawdata_aln/")'
#' @param fa_outpath Character. Path to file. Master protein file with AccNum & lineages.
#' @param fa_outpath Character. Path to file. Master protein file with AccNum &
#' lineages.
#' Default is 'here("data/rawdata_tsv/all_semiclean.txt")'
#' @param lin_file Character. Path to the written fasta file.
#' Default is 'here("data/alns/")'.
#' @param reduced Boolean. If TRUE, the fasta file will contain only one sequence per lineage.
#' @param reduced Boolean. If TRUE, the fasta file will contain only one
#' sequence per lineage.
#' Default is 'FALSE'.
#'
#' @importFrom purrr pmap
#' @importFrom stringr str_replace_all
#'
#' @return
#' @return NULL. The function saves the output FASTA files to the specified
#' directory.
#'
#' @details The alignment files would need two columns separated by spaces: 1. AccNum and 2. alignment. The protein homolog file should have AccNum, Species, Lineages.
#' @note Please refer to the source code if you have alternate + file formats and/or column names.
#' @details The alignment files would need two columns separated by spaces:
#' 1. AccNum and 2. alignment. The protein homolog file should have AccNum,
#' Species, Lineages.
#' @note Please refer to the source code if you have alternate file formats
#' and/or column names.
#'
#' @export
#'
Expand Down Expand Up @@ -447,33 +478,38 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),

# accessions <- c("P12345","Q9UHC1","O15530","Q14624","P0DTD1")
# accessions <- rep("ANY95992.1", 201)
#' acc2FA
#' acc2FA
#'
#' @description
#' converts protein accession numbers to a fasta format. Resulting
#' converts protein accession numbers to a fasta format. Resulting
#' fasta file is written to the outpath.
#'
#' @author Samuel Chen, Janani Ravi
#' @keywords accnum, fasta
#'
#' @param accessions Character vector containing protein accession numbers to generate fasta sequences for.
#' @param accessions Character vector containing protein accession numbers to
#' generate fasta sequences for.
#' Function may not work for vectors of length > 10,000
#' @param outpath [str] Location where fasta file should be written to.
#' @param plan
#' @param plan Character string specifying the parallel processing strategy to
#' use with the `future` package. Default is "sequential".
#'
#' @importFrom Biostrings readAAStringSet
#' @importFrom future future plan value
#' @importFrom purrr map
#' @importFrom rentrez entrez_fetch
#'
#' @return
#' @return A logical value indicating whether the retrieval and conversion were
#' successful. Returns `TRUE` if successful and `FALSE` otherwise.
#' @export
#'
#' @examples
#' \dontrun{
#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta")
#' acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"),
#' outpath = "my_proteins.fasta")
#' Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa")
#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa")
#' EBI:accessions <- c("P12345", "Q9UHC1",
#' "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa")
#' }
acc2FA <- function(accessions, outpath, plan = "sequential") {
# validation
Expand Down Expand Up @@ -547,9 +583,10 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
}

#' createRepresentativeAccNum
#'
#'
#' @description
#' Function to generate a vector of one Accession number per distinct observation from 'reduced' column
#' Function to generate a vector of one Accession number per distinct
#' observation from 'reduced' column
#'
#' @author Samuel Chen, Janani Ravi
#'
Expand All @@ -562,14 +599,19 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
#' @importFrom dplyr filter pull
#' @importFrom rlang sym
#'
#' @return
#' @return A character vector containing one Accession number per distinct
#' observation from the specified reduced column.
#' @export
#'
#' @examples
#' \dontrun{
#' createRepresentativeAccNum(prot)
#' }
createRepresentativeAccNum <- function(prot_data,
reduced = "Lineage",
accnum_col = "AccNum") {
# Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column
# Get Unique reduced column and then bind the AccNums back to get one
# AccNum per reduced column
reduced_sym <- sym(reduced)
accnum_sym <- sym(accnum_col)

Expand All @@ -596,15 +638,17 @@ createRepresentativeAccNum <- function(prot_data,
}

#' alignFasta
#'
#'
#' @description
#' Perform a Multiple Sequence Alignment on a FASTA file.
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @param fasta_file Path to the FASTA file to be aligned
#' @param tool Type of alignment tool to use. One of three options: "Muscle", "ClustalO", or "ClustalW"
#' @param outpath Path to write the resulting alignment to as a FASTA file. If NULL, no file is written
#' @param tool Type of alignment tool to use. One of three options: "Muscle",
#' "ClustalO", or "ClustalW"
#' @param outpath Path to write the resulting alignment to as a FASTA file.
#' If NULL, no file is written
#'
#' @importFrom Biostrings readAAStringSet
#' @importFrom msa msaClustalOmega msaMuscle msaClustalW
Expand All @@ -613,6 +657,10 @@ createRepresentativeAccNum <- function(prot_data,
#' @export
#'
#' @examples
#' \dontrun{
#' aligned_sequences <- alignFasta("my_sequences.fasta",
#' tool = "Muscle", outpath = "aligned_output.fasta")
#' }
alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
fasta <- readAAStringSet(fasta_file)

Expand Down Expand Up @@ -641,11 +689,14 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
#' @importFrom Biostrings toString unmasked
#' @importFrom readr write_file
#'
#' @return
#' @return Character string representing the content of the written FASTA file.
#' @export
#'
#' @examples
writeMSA_AA2FA <- function(alignment, outpath) {
#' \dontrun{
#' writeMSA_AA2FA(alignment, outpath = "aligned_output.fasta")
#' }
writeMSA_AA2FA <- function(writeMSA_AA2FA, outpath) {
l <- length(rownames(alignment))
fasta <- ""
for (i in 1:l)
Expand All @@ -660,14 +711,18 @@ writeMSA_AA2FA <- function(alignment, outpath) {

#' getAccNumFromFA
#'
#' @param fasta_file
#' @param fasta_file Character. The path to the FASTA file from which
#' accession numbers will be extracted.
#'
#' @importFrom stringi stri_extract_all_regex
#'
#' @return
#' @return A character vector containing the extracted accession numbers.
#' @export
#'
#' @examples
#' \dontrun{
#' getAccNumFromFA("my_sequences.fasta")
#' }
getAccNumFromFA <- function(fasta_file) {
txt <- read_file(fasta_file)
accnums <- stringi::stri_extract_all_regex(fasta_file, "(?<=>)[\\w,.]+")[[1]]
Expand Down
53 changes: 35 additions & 18 deletions R/blastWrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,29 @@
#' Run DELTABLAST to find homologs for proteins of interest
#'
#' @author Samuel Chen, Janani Ravi
#' @description
#' This function executes a Delta-BLAST search using the specified parameters
#' and database. It sets the BLAST database path, runs the Delta-BLAST command
#' with the given query, and outputs the results.
#'
#' @param deltablast_path
#' @param db_search_path Path to the BLAST databases
#' @param db
#' @param query
#' @param evalue
#' @param out
#' @param num_alignments
#' @param num_threads
#' @param deltablast_path Path to the Delta-BLAST executable.
#' @param db_search_path Path to the BLAST databases.
#' @param db Name of the BLAST database to search against (default is "refseq").
#' @param query Path to the input query file.
#' @param evalue E-value threshold for reporting matches (default is "1e-5").
#' @param out Path to the output file where results will be saved.
#' @param num_alignments Number of alignments to report.
#' @param num_threads Number of threads to use for the search (default is 1).
#'
#' @return
#' @return This function does not return a value; it outputs results to the
#' specified file.
#' @export
#'
#' @examples
runDeltaBlast <- function(deltablast_path, db_search_path,
#' \dontrun{
#' runDeltaBlast(deltablast_path, db_search_path, query = query, out = out,
#'   num_alignments = num_alignments)
#' }
runDeltaBlast <- function(runDeltaBlast, db_search_path,
db = "refseq", query, evalue = "1e-5",
out, num_alignments, num_threads = 1) {
start <- Sys.time()
Expand All @@ -43,18 +51,27 @@ runDeltaBlast <- function(deltablast_path, db_search_path,

#' Run RPSBLAST to generate domain architectures for proteins of interest
#'
#' @param rpsblast_path
#' @param db_search_path Path to the BLAST databases
#' @param db
#' @param query
#' @param evalue
#' @param out
#' @param num_threads
#' @description
#' This function executes an RPS-BLAST search to generate domain architectures
#' for specified proteins. It sets the BLAST database path, runs the RPS-BLAST
#' command with the provided query, and outputs the results.
#'
#' @return
#' @param rpsblast_path Path to the RPS-BLAST executable.
#' @param db_search_path Path to the BLAST databases.
#' @param db Name of the BLAST database to search against (default is "refseq").
#' @param query Path to the input query file.
#' @param evalue E-value threshold for reporting matches (default is "1e-5").
#' @param out Path to the output file where results will be saved.
#' @param num_threads Number of threads to use for the search (default is 1).
#'
#' @return This function does not return a value; it outputs results to the
#' specified file.
#' @export
#'
#' @examples
#' \dontrun{
#' runRPSBlast(rpsblast_path, db_search_path, query = query, out = out)
#' }
runRPSBlast <- function(rpsblast_path, db_search_path,
db = "refseq", query, evalue = "1e-5",
out, num_threads = 1) {
Expand Down
Loading