diff --git a/R/tree.R b/R/tree.R index 5cdc20d1..596c4503 100755 --- a/R/tree.R +++ b/R/tree.R @@ -31,73 +31,278 @@ ## Functions ## ############### -########################### -## Approach 0 | FastTree2.0 -########################### -## !! FastTree will only work if there are unique sequence names!! -#' convertFA2Tree +############################## +## Approach 0-4 +############ +#' Approach 0: !! FastTree will only work if there are unique sequence names!! +#' createFA2Tree +#' @author Klangina, Outreachy Applicant +#' @author Janani Ravi, MolEcologist +#' @keywords phylogenetic tree, alignment, fasta, FastTree +#' @description +#' Generates a phylogenetic tree from an alignment file '.fa' using either FastTree or a detailed phylogenetic analysis approach. #' -#' @param fa_path Path to the input FASTA alignment file (.fa). Default is the -#' path to "data/alns/pspa_snf7.fa". -#' @param tre_path Path to the output file where the generated tree (.tre) will -#' be saved. Default is the path to "data/alns/pspa_snf7.tre". -#' @param fasttree_path Path to the FastTree executable, which is used to -#' generate the phylogenetic tree. Default is "src/FastTree". +#' @param fa_file Character. Path to the alignment FASTA file (.fa) from which +#' the phylogenetic tree will be generated. Default is "data/alns/pspa_snf7.fa". +#' @param out_file Character. Path to the output file where the generated tree (.tre) will +#' be saved. Default is "data/alns/pspa_snf7.tre". +#' @param enable_fast Logical. If TRUE, uses FastTree approach; if FALSE, uses detailed phylogenetic analysis. Default is TRUE. +#' @param fasttree_path Character. Path to the FastTree executable. Default is "src/FastTree". +#' @param format Character. Format of the input file. Default is "fasta". +#' @param f_seq_type Character. Type of fasta sequence, DNA or protein. Default is "protein". Choices are "protein","dna". +#' @param seq_type Character. Type of sequences in the input file. Default is "AA" (amino acid). +#' @param dist_matrix Character. Distance matrix to be used. Default is "similarity". +#' @param subset_size Numeric. Number of sequences to subset for analysis. Default is 10. +#' @param model_test Character. Model to use for model testing. Default is "all". +#' @param dna_model Character. Model to use for DNA distance calculation. Default is "JC69". +#' @param ml_model Character. Model to use for maximum likelihood estimation. Default is "JC". +#' @param rearrangement Character. Type of tree rearrangement for ML optimization. Default is "stochastic". +#' @param bootstrap_reps Numeric. Number of bootstrap replicates. Default is 100. +#' @param bootstrap_p Numeric. P-value threshold for bootstrap support. Default is 50. +#' @param plot_type Character. Type of plot for bootstrap tree. Default is "p". +#' @param bootstrap_file Character. Path to save the bootstrap tree file. Default is "bootstrap_example.tre". +#' +#' @importFrom ape write.tree +#' @importFrom phangorn bootstrap.pml dist.ml NJ modelTest phyDat plotBS pml pml.control pratchet optim.parsimony optim.pml read.phyDat upgma midpoint +#' @importFrom seqinr dist.alignment read.alignment +#' @importFrom stats logLik #' -#' @return No return value. The function generates a tree file (.tre) from the -#' input FASTA file. +#' @return No return value. The function generates phylogenetic tree files +#' (.tre) based on either FastTree or detailed phylogenetic analysis approaches. +#' It also produces various plots and console outputs when using the detailed approach. #' @export #' +#' @details The function offers two main approaches for phylogenetic tree construction: +#' 1. FastTree approach: A quick method using the FastTree program. +#' 2. Detailed phylogenetic analysis: Includes model testing, distance matrix calculation, +#' UPGMA and NJ tree construction, parsimony searches, and maximum likelihood estimation +#' with bootstrapping. +#' The alignment file should be in FASTA format with sequences properly aligned. +#' When using FastTree, ensure that the FastTree executable is correctly installed and +#' the path is properly specified. +#' +#' @note This function requires the ape, phangorn, and seqinr packages to be installed. +#' The detailed analysis approach performs computationally intensive operations and may +#' take a significant amount of time to run, especially for large datasets or with high +#' numbers of bootstrap replicates. FastTree approach is significantly faster but may +#' provide less detailed results. +#' #' @examples #' \dontrun{ -#' convert_fa2tre(here("data/alns/pspa_snf7.fa"), -#' here("data/alns/pspa_snf7.tre"), -#' here("src/FastTree") +#' # Using FastTree approach +#' createFA2Tree("data/alns/my_alignment.fa", "data/trees/my_tree.tre", enable_fast = TRUE) +#' +#' # Using detailed phylogenetic analysis approach +#' createFA2Tree("data/alns/protein_align.fa", "data/trees/protein_tree.tre", +#' enable_fast = FALSE, subset_size = 20, bootstrap_reps = 500) #' } -convertFA2Tree <- function(fa_path = here("data/alns/pspa_snf7.fa"), - tre_path = here("data/alns/pspa_snf7.tre"), - fasttree_path = here("src/FastTree")) { - # fa_path=here("data/alns/pspa_snf7.fa") - # tre_path=here("data/alns/pspa_snf7.tre") - # fasttree_path=here("src/FastTree") - print(fa_path) - system2( - command = fasttree_path, - args = paste(c(fa_path, ">", tre_path), - sep = "", collapse = " " +createFA2Tree <- function( + fa_file = "data/alns/pspa_snf7.fa", + out_file = "data/alns/pspa_snf7.tre", + enable_fast = TRUE, + fasttree_path = "src/FastTree", + format = "fasta", + f_seq_type = "protein", + seq_type = "AA", + dist_matrix = "similarity", + subset_size = 10, + model_test = "all", + dna_model = "JC69", + ml_model = "JC", + rearrangement = "stochastic", + bootstrap_reps = 100, + bootstrap_p = 50, + plot_type = "p", + bootstrap_file = "bootstrap_example.tre" +) { + ## SAMPLE ARGS + # fa_file="data/alns/pspa_snf7.fa" + # out_file="data/alns/pspa_snf7.tre" + # fasttree_path="src/FastTree" + + if (enable_fast) { + # FastTree approach + print(fa_file) + system2( + command = fasttree_path, + args = paste(c(fa_file, ">", out_file), + sep = "", collapse = " " + ) ) - ) - ################################ - ## Compiling FastTree.c on a Mac - ################################ - ## Other platforms? Check http://www.microbesonline.org/fasttree/ - # system(paste("gcc -DNO_SSE -O3 -finline-functions -funroll-loops -Wall -o", - # here("src/FastTree"), - # here("src/FastTree.c"), "-lm", collapse=" ")) + ################################ + ## Compiling FastTree.c on a Mac + ################################ + ## Other platforms? Check http://www.microbesonline.org/fasttree/ + # system(paste("gcc -DNO_SSE -O3 -finline-functions -funroll-loops -Wall -o", + # here("src/FastTree"), + # here("src/FastTree.c"), "-lm", collapse=" ")) + } else { + # Detailed phylogenetic analysis approach + ########################### + ## Approach 1 + ########################### + ## https://a-little-book-of-r-for-bioinformatics.readthedocs.io/en/latest/src/chapter5.html + prot_aln <- seqinr::read.alignment(file = fa_file, format = format) + prot_aln$seq + prot_dist <- dist.alignment(prot_aln, matrix = dist_matrix) + prot_nj <- NJ(prot_dist) + prot_nj_tree <- plot(prot_nj, main = "Neighbor Joining") + write.tree(phy = prot_nj_tree, file = out_file) + + ########################### + ## Approach 2 + ########################### + ## Alignment file formats and conversion + # read in sequence data, convert to phyDat + prot_fa <- read.phyDat(fa_file, format = format, type = seq_type) + prot_phyDat <- phyDat(prot_fa, type = seq_type, levels = NULL) + prot_subset <- subset(prot_phyDat, 1:subset_size) + prot_subset_phyDat <- phyDat(prot_subset, type = seq_type, levels = NULL) + + ## Model Testing & Distance Matrices + ## Comparison of different nucleotide or amino acid substitution models + mt <- modelTest(prot_subset, model = model_test) + print(mt) + + # estimate a distance matrix using a Jules-Cantor Model + if(f_seq_type=="protein"){ + seq_dist <- dist.ml(prot_subset, model = mt) + } else if(f_seq_type=="dna") { + seq_dist <- dist.ml(prot_subset, model = dna_model) + } else { + stop("Error: The f_seq_type must be one of ['protein','dna'].") + } + + ## Neighbor Joining, UPGMA, and Maximum Parsimony + # quick and dirty UPGMA and NJ trees + prot_UPGMA <- upgma(seq_dist) + prot_NJ <- NJ(seq_dist) + plot(prot_UPGMA, main = "UPGMA") + plot(prot_NJ, main = "Neighbor Joining") + + # parsimony searches + prot_optim <- optim.parsimony(prot_NJ, prot_phyDat) + prot_pratchet <- pratchet(prot_subset) # returning error + plot(prot_optim) + plot(prot_pratchet) + + ## Maximum likelihood and Bootstrapping + # ml estimation w/ distance matrix + fit <- pml(prot_NJ, prot_subset) + print(fit) + if(f_seq_type=="protein"){ + fit_optimized <- optim.pml(fit, model = fit, rearrangement = rearrangement) + } else if(f_seq_type=="dna") { + fit_optimized <- optim.pml(fit, model = ml_model, rearrangement = rearrangement) + } else { + stop("Error: The f_seq_type must be one of ['protein','dna'].") + } + logLik(fit_optimized) + bs <- bootstrap.pml(fit_optimized, + bs = bootstrap_reps, optNni = TRUE, multicore = TRUE, + control = pml.control(trace = 0) + ) + plotBS(phangorn::midpoint(fit_optimized$tree), bs, p = bootstrap_p, type = plot_type) + + # subsetted alignment bs example + prot_subset_dm <- dist.ml(prot_subset) + prot_subset_NJ <- NJ(prot_subset_dm) + fit2 <- pml(prot_subset_NJ, data = prot_subset) + print(fit2) + fit_optimized2 <- NULL + if(f_seq_type=="protein"){ + fit_optimized2 <- optim.pml(fit2, model = fit2, rearrangement = rearrangement) + } else if(f_seq_type=="dna") { + fit_optimized2 <- optim.pml(fit2, model = ml_model, rearrangement = rearrangement) + } else { + stop("Error: The f_seq_type must be one of ['protein','dna'].") + } + logLik(fit_optimized2) + bs_subset <- bootstrap.pml(fit_optimized2, + bs = bootstrap_reps, optNni = TRUE, multicore = TRUE, + control = pml.control(trace = 0) + ) + bs2 <- plotBS(phangorn::midpoint(fit_optimized2$tree), bs_subset, p = bootstrap_p, type = plot_type) + + ## Exporting Trees + write.tree(bs2, file = bootstrap_file) + } } -## Generate Trees for ALL fasta files in "data/alns" #' convertAlignment2Trees -#' +#' @author Klangina, Outreachy Applicant #' @description -#' Generate Trees for ALL fasta files in "data/alns" +#' Generate Trees for ALL fasta files in the specified directory. This function processes +#' multiple alignment files and creates phylogenetic trees for each, using either FastTree +#' or a detailed phylogenetic analysis approach. #' -#' @param aln_path Path to the directory containing all the alignment FASTA -#' files (.fa) for which trees will be generated. Default is "data/alns/". -#' +#' @param aln_path Character. Path to the directory containing all the alignment FASTA +#' files (.fa) for which trees will be generated. Default is here("data/alns/"). +#' @param enable_fast Logical. If TRUE, uses FastTree approach; if FALSE, uses detailed +#' phylogenetic analysis. Default is TRUE. +#' @param fasttree_path Character. Path to the FastTree executable. Default is here("src/FastTree"). +#' @param format Character. Format of the input files. Default is "fasta". +#' @param f_seq_type Character. Type of fasta sequence, DNA or protein. Default is "protein". Choices are "protein","dna". +#' @param seq_type Character. Type of sequences in the input files. Default is "AA" (amino acid). +#' @param dist_matrix Character. Distance matrix to be used. Default is "similarity". +#' @param subset_size Numeric. Number of sequences to subset for analysis. Default is 10. +#' @param model_test Character. Model to use for model testing. Default is "all". +#' @param dna_model Character. Model to use for DNA distance calculation. Default is "JC69". +#' @param ml_model Character. Model to use for maximum likelihood estimation. Default is "JC". +#' @param rearrangement Character. Type of tree rearrangement for ML optimization. Default is "stochastic". +#' @param bootstrap_reps Numeric. Number of bootstrap replicates. Default is 100. +#' @param bootstrap_p Numeric. P-value threshold for bootstrap support. Default is 50. +#' @param plot_type Character. Type of plot for bootstrap tree. Default is "p". +#' @param bootstrap_file_suffix Character. Suffix for bootstrap tree files. Default is "_bootstrap.tre". #' #' @importFrom here here #' @importFrom purrr pmap #' @importFrom stringr str_replace_all #' -#' @return No return value. The function generates tree files (.tre) for each -#' alignment file in the specified directory. +#' @return No return value. The function generates tree files (.tre) and bootstrap files +#' for each alignment file in the specified directory. #' @export #' +#' @details This function processes all .fa files in the specified directory. For each file, +#' it generates a phylogenetic tree using either FastTree (if enable_fast is TRUE) or a +#' detailed phylogenetic analysis approach (if enable_fast is FALSE). The detailed approach +#' includes model testing, distance matrix calculation, UPGMA and NJ tree construction, +#' parsimony searches, and maximum likelihood estimation with bootstrapping. +#' +#' @note This function requires the ape, phangorn, and seqinr packages to be installed. +#' The detailed analysis approach performs computationally intensive operations and may +#' take a significant amount of time to run, especially for large datasets or with high +#' numbers of bootstrap replicates. FastTree approach is significantly faster but may +#' provide less detailed results. +#' #' @examples #' \dontrun{ -#' generate_trees(here("data/alns/")) +#' # Using FastTree approach for all alignments in the default directory +#' convertAlignment2Trees() +#' +#' # Using detailed phylogenetic analysis for alignments in a specific directory +#' convertAlignment2Trees(aln_path = "path/to/alignments/", +#' enable_fast = FALSE, +#' subset_size = 20, +#' bootstrap_reps = 500) #' } -convertAlignment2Trees <- function(aln_path = here("data/alns/")) { +convertAlignment2Trees <- function( + aln_path = here("data/alns/"), + enable_fast = TRUE, + fasttree_path = here("src/FastTree"), + format = "fasta", + f_seq_type = "protein", + seq_type = "AA", + dist_matrix = "similarity", + subset_size = 10, + model_test = "all", + dna_model = "JC69", + ml_model = "JC", + rearrangement = "stochastic", + bootstrap_reps = 100, + bootstrap_p = 50, + plot_type = "p", + bootstrap_file_suffix = "_bootstrap.tre" +) { # finding all fasta alignment files fa_filenames <- list.files(path = aln_path, pattern = "*.fa") fa_paths <- paste0(aln_path, fa_filenames) @@ -107,124 +312,27 @@ convertAlignment2Trees <- function(aln_path = here("data/alns/")) { ## Using purrr::pmap to generate trees for each of the fa files! fa2tre_args <- list( - fa_path = fa_paths, - tre_path = paste0(aln_path, variable, ".tre") + fa_file = fa_paths, + out_file = paste0(aln_path, variable, ".tre"), + enable_fast = enable_fast, + fasttree_path = fasttree_path, + format = format, + f_seq_type = f_seq_type, + seq_type = seq_type, + dist_matrix = dist_matrix, + subset_size = subset_size, + model_test = model_test, + dna_model = dna_model, + ml_model = ml_model, + rearrangement = rearrangement, + bootstrap_reps = bootstrap_reps, + bootstrap_p = bootstrap_p, + plot_type = plot_type, + bootstrap_file = paste0(aln_path, variable, bootstrap_file_suffix) ) + pmap( - .l = fa2tre_args, .f = convertFA2Tree, - fasttree_path = here("src/FastTree") - ) -} - -############################## -## REFS: 1-4 -############ -#' createFA2Tree -#' -#' @author Janani Ravi, MolEcologist -#' @keywords phylogenetic tree, alignment, fasta -#' @description -#' Generating phylogenetic tree from alignment file '.fa' -#' -#' @param fa_file Character. Path to the alignment FASTA file (.fa) from which -#' the phylogenetic tree will be generated. Default is 'pspa_snf7.fa'. -#' @param out_file Path to the output file where the generated tree (.tre) will -#' be saved. Default is "data/alns/pspa_snf7.tre". -#' -#' @importFrom ape write.tree -#' @importFrom phangorn bootstrap.pml dist.ml NJ modelTest phyDat plotBS pml pml.control pratchet optim.parsimony optim.pml read.phyDat upgma -#' @importFrom seqinr dist.alignment read.alignment -#' @importFrom stats logLik -#' -#' @return No return value. The function generates a phylogenetic tree file -#' (.tre) based on different approaches like Neighbor Joining, UPGMA, and -#' Maximum Likelihood. -#' @export -#' -#' @details The alignment file would need two columns: 1. accession + -#' number and 2. alignment. The protein homolog accession to lineage mapping + -#' file should have -#' -#' @note Please refer to the source code if you have alternate + -#' file formats and/or column names. -#' -#' @examples -#' \dontrun{ -#' generate_aln2tree("pspa_snf7.fa") -#' } -createFA2Tree <- function(fa_file = "data/alns/pspa_snf7.fa", - out_file = "data/alns/pspa_snf7.tre") { - ## SAMPLE ARGS - # fa_file="data/alns/pspa_snf7.fa" - # out_file="data/alns/pspa_snf7.tre" - - ########################### - ## Approach 1 - ########################### - ## https://a-little-book-of-r-for-bioinformatics.readthedocs.io/en/latest/src/chapter5.html - prot_aln <- read.alignment(file = fa_file, format = "fasta") - prot_aln$seq - prot_dist <- dist.alignment(prot_aln, matrix = "similarity") - prot_nj <- NJ(prot_dist) - prot_nj_tree <- plot(prot_nj, main = "Neighbor Joining") - write.tree(phy = prot_nj_tree, file = tre_path) - - ########################### - ## Approach 2 - ########################### - ## Alignment file formats and conversion - # read in sequence data, convert to phyDat - prot_fa <- read.phyDat(fa_file, format = "fasta", type = "AA") - prot_phyDat <- phyDat(prot_fa, type = "AA", levels = NULL) - prot10 <- subset(prot_phyDat, 1:10) - prot10_phyDat <- phyDat(prot10, type = "AA", levels = NULL) - - ## Model Testing & Distance Matrices - ## Comparison of different nucleotide or amino acid substitution models - mt <- modelTest(prot10, model = "all") - print(mt) - - # estimate a distance matrix using a Jules-Cantor Model - dna_dist <- dist.ml(prot10, model = "JC69") - - ## Neighbor Joining, UPGMA, and Maximum Parsimony - # quick and dirty UPGMA and NJ trees - prot_UPGMA <- upgma(dna_dist) - prot_NJ <- NJ(dna_dist) - plot(prot_UPGMA, main = "UPGMA") - plot(prot_NJ, main = "Neighbor Joining") - - # parsimony searches - prot_optim <- optim.parsimony(prot_NJ, prot) - prot_pratchet <- pratchet(prot10) # returning error - plot(prot_optim) - plot(prot_pratchet) - - ## Maximum likelihood and Bootstrapping - # ml estimation w/ distance matrix - fit <- pml(prot_NJ, prot10) - print(fit) - fitJC <- optim.pml(fit, model = "JC", rearrangement = "stochastic") - logLik(fitJC) - bs <- bootstrap.pml(fitJC, - bs = 100, optNni = TRUE, multicore = TRUE, - control = pml.control(trace = 0) + .l = fa2tre_args, + .f = createFA2Tree ) - plotBS(midpoint(fitJC$tree), bs, p = 50, type = "p") - - # subsetted alignment bs example - prot10_dm <- dist.ml(prot10) - prot10_NJ <- NJ(prot10_dm) - fit2 <- pml(prot10_NJ, data = prot10) - print(fit2) - fitJC2 <- optim.pml(fit2, model = "JC", rearrangement = "stochastic") - logLik(fitJC2) - bs_subset <- bootstrap.pml(fitJC2, - bs = 100, optNni = TRUE, multicore = TRUE, - control = pml.control(trace = 0) - ) - bs2 <- plotBS(midpoint(fitJC2$tree), bs, p = 50, type = "p") - - ## Exporting Trees - write.tree(bs2, file = "bootstrap_example.tre") -} +} \ No newline at end of file