diff --git a/docs/build_docs.sh b/docs/build_docs.sh index ec9c83ca..0aefd331 100644 --- a/docs/build_docs.sh +++ b/docs/build_docs.sh @@ -1,11 +1,9 @@ rm -rf build conda env remove -n alphabasedocs conda create -n alphabasedocs python=3.9 -y -# conda create -n alphatimsinstaller python=3.9 + conda activate alphabasedocs -# call conda install git -y -# call pip install 'git+https://github.com/MannLabs/alphatims.git#egg=alphatims[gui]' --use-feature=2020-resolver -# brew install freetype + pip install '../.[development]' make html conda deactivate diff --git a/nbs_tests/match/psm_match.ipynb b/nbs_tests/match/psm_match.ipynb new file mode 100644 index 00000000..10599c6d --- /dev/null +++ b/nbs_tests/match/psm_match.ipynb @@ -0,0 +1,333 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#---#| default_exp match.psm_match" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Match" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Peak matching functionalities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from peptdeep.match.psm_match import PepSpecMatch\n", + "from alpharaw import register_all_readers\n", + "from alpharaw.ms_data_base import ms_reader_provider\n", + "\n", + "register_all_readers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "import io\n", + "import copy\n", + "\n", + "from alphabase.psm_reader.pfind_reader import pFindReader\n", + "from alphabase.peptide.fragment import create_fragment_mz_dataframe_by_sort_precursor\n", + "from alpharaw.legacy_msdata.mgf import MGFReader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:01<00:00, 1.41it/s]\n" + ] + } + ], + "source": [ + "#| hide\n", + "#unittest\n", + "mgf = io.StringIO(\"\"\"\n", + "BEGIN IONS\n", + "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.31809.31809.3.0.dta\n", + "CHARGE=3+\n", + "RTINSECONDS=0.5418930\n", + "PEPMASS=272.276336\n", + "103.92207 5457.3\n", + "104.20045 5051.4\n", + "108.70090 5891.7\n", + "113.94175 6442.6\n", + "116.92975 40506.3\n", + "116.93716 8945.5\n", + "128.37773 6427.8\n", + "131.95308 288352.6\n", + "133.93259 7344.6\n", + "138.44611 7326.1\n", + "139.00072 41556.8\n", + "140.00319 16738.8\n", + "140.99719 9493.8\n", + "145.93156 10209.3\n", + "145.94897 10497.8\n", + "147.94559 8206.3\n", + "147.96396 30552.8\n", + "148.95543 14654.7\n", + "149.96338 234207.8\n", + "150.95096 8306.0\n", + "157.01089 84638.9\n", + "158.01357 27925.7\n", + "159.00627 16084.7\n", + "163.94281 24751.1\n", + "163.95915 32203.3\n", + "165.95605 44458.0\n", + "165.97186 11530.2\n", + "166.99500 26432.2\n", + "167.97302 9216.7\n", + "181.95230 13858.8\n", + "191.95448 66152.7\n", + "192.95538 8408.9\n", + "193.07185 9092.8\n", + "193.95313 660574.9\n", + "194.95674 23452.8\n", + "194.99008 143940.9\n", + "200.00568 19510.8\n", + "200.99942 23678.7\n", + "204.30894 9406.1\n", + "209.96466 21853.6\n", + "211.96245 65351.0\n", + "218.90355 9149.6\n", + "223.91072 11300.2\n", + "238.89684 12108.8\n", + "243.93825 10150.2\n", + "243.97040 10987.7\n", + "244.94121 8744.2\n", + "246.90314 11556.3\n", + "271.93225 29430.0\n", + "271.99219 51184.4\n", + "272.19150 31960.4\n", + "272.98602 35844.1\n", + "273.94431 11031.8\n", + "284.47998 8191.3\n", + "290.00125 66212.4\n", + "290.99539 54064.7\n", + "293.89490 10005.0\n", + "407.06372 10838.2\n", + "464.36697 9715.4\n", + "633.40036 633.40036\n", + "698.81390 9711.7\n", + "707.301117 707.301117\n", + "END IONS\n", + "BEGIN IONS\n", + "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23862.23862.2.0.dta\n", + "CHARGE=2+\n", + "RTINSECONDS=0.6455220\n", + "PEPMASS=287.427959\n", + "103.34669 5304.0\n", + "104.66884 5639.7\n", + "113.42419 6258.3\n", + "118.84039 5837.5\n", + "119.93203 13977.3\n", + "130.69589 6876.2\n", + "133.94824 43094.3\n", + "134.30524 7671.5\n", + "135.96359 9031.3\n", + "138.99994 8329.7\n", + "146.95573 31143.9\n", + "147.96323 12176.5\n", + "150.95151 65859.3\n", + "151.95818 24384.2\n", + "157.01105 19241.5\n", + "157.34985 7532.5\n", + "161.08838 7843.9\n", + "161.94234 20119.7\n", + "162.95146 60110.4\n", + "163.95877 183305.5\n", + "164.96657 13647.5\n", + "174.95139 150331.9\n", + "175.95258 21393.4\n", + "178.94460 11433.1\n", + "179.95316 13650.5\n", + "180.96204 15353.5\n", + "190.94572 30418.9\n", + "191.95422 61914.1\n", + "192.61461 8642.1\n", + "192.94395 12331.4\n", + "192.96207 132342.5\n", + "193.96318 19303.0\n", + "209.04164 25149.6\n", + "209.96368 154185.0\n", + "209.98361 12353.5\n", + "213.86244 11541.3\n", + "224.93071 12903.0\n", + "228.92879 8773.6\n", + "241.86043 135357.5\n", + "242.86113 20805.2\n", + "242.94327 26679.4\n", + "243.95219 29569.9\n", + "244.92361 12153.5\n", + "246.90300 16650.3\n", + "252.96521 73484.3\n", + "253.96646 11527.5\n", + "286.85858 10166.4\n", + "287.94186 18763.2\n", + "303.87665 39189.3\n", + "304.88116 11976.0\n", + "321.89087 97122.5\n", + "322.88867 28020.8\n", + "370.28696 9008.2\n", + "389.82578 13277.0\n", + "407.83545 12220.4\n", + "425.84872 13236.5\n", + "482.54852 10940.2\n", + "END IONS\n", + "BEGIN IONS\n", + "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23431.23431.2.0.dta\n", + "CHARGE=2+\n", + "RTINSECONDS=0.6455220\n", + "PEPMASS=287.427959\n", + "103.34669 5304.0\n", + "104.66884 5639.7\n", + "END IONS\n", + "BEGIN IONS\n", + "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.32733.32733.2.0.dta\n", + "CHARGE=2+\n", + "RTINSECONDS=0.6455220\n", + "PEPMASS=287.427959\n", + "103.34669 5304.0\n", + "104.66884 5639.7\n", + "402.705571 402.705571\n", + "END IONS\n", + "BEGIN IONS\n", + "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23669.23669.2.0.dta\n", + "CHARGE=2+\n", + "RTINSECONDS=0.6455220\n", + "PEPMASS=287.427959\n", + "END IONS\n", + "\"\"\")\n", + "\n", + "ms_file_dict = {\n", + " 'raw': copy.deepcopy(mgf),\n", + " 'raw1': copy.deepcopy(mgf),\n", + "}\n", + "\n", + "psmlabel_str = '''File_Name\tSequence\tModification\tCharge\tScan_No\tProteins\tQ-value\tTarget/Decoy\tFinal_Score\n", + "raw.31809.31809.2.0.dta\tPSTDLLMLK\t2,Phospho[S];7,Oxidation[M];\t2\t31809\tProt\t0\ttarget\t100\n", + "raw.23862.23862.2.0.dta\tHTAYSDFLSDK\t\t2\t23862\tProt\t0\ttarget\t100\n", + "raw.23431.23431.2.0.dta\tHTAYSDFLSDK\t\t2\t23431\tProt\t0\ttarget\t100\n", + "raw.32733.32733.2.0.dta\tHFALFSTDVTK\t\t2\t32733\tProt\t0\ttarget\t100\n", + "raw.23669.23669.2.0.dta\tHTAYSDFLSDK\t\t2\t23669\tProt\t0\ttarget\t100\n", + "raw1.31809.31809.2.0.dta\tPSTDLLMLK\t2,Phospho[S];7,Oxidation[M];\t2\t31809\tProt\t0\ttarget\t100\n", + "raw1.23862.23862.2.0.dta\tHTAYSDFLSDK\t\t2\t23862\tProt\t0\ttarget\t100\n", + "raw1.23431.23431.2.0.dta\tHTAYSDFLSDK\t\t2\t23431\tProt\t0\ttarget\t100\n", + "raw1.32733.32733.2.0.dta\tHFALFSTDVTK\t\t2\t32733\tProt\t0\ttarget\t100\n", + "'''\n", + "reader = pFindReader()\n", + "reader.import_file(io.StringIO(psmlabel_str))\n", + "psm_df = reader.psm_df\n", + "matching = PepSpecMatch()\n", + "matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')\n", + "merrs = matching.matched_mz_err_df.values\n", + "#np.sum(matching.matched_intensity_df.values!=0,axis=1)\n", + "assert len(merrs[~np.isinf(merrs)])==6\n", + "assert np.count_nonzero(matching.matched_intensity_df.values)==6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:00<00:00, 174.90it/s]\n" + ] + } + ], + "source": [ + "#| hide\n", + "mgf_reader = ms_reader_provider.get_reader('mgf')\n", + "mgf_reader.import_raw(copy.deepcopy(mgf))\n", + "ms_file_dict = {'raw': mgf_reader}\n", + "mgf_reader1 = ms_reader_provider.get_reader('mgf')\n", + "mgf_reader1.import_raw(copy.deepcopy(mgf))\n", + "ms_file_dict['raw1'] = mgf_reader1\n", + "matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')\n", + "merrs = matching.matched_mz_err_df.values\n", + "assert np.count_nonzero(matching.matched_intensity_df.values) == 6\n", + "assert len(merrs[~np.isinf(merrs)]) == 6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 111.73it/s]\n" + ] + } + ], + "source": [ + "#| hide\n", + "ms_file_dict = {\n", + " 'raw': copy.deepcopy(mgf),\n", + "}\n", + "reader = pFindReader()\n", + "reader.import_file(io.StringIO(psmlabel_str))\n", + "psm_df = reader.psm_df\n", + "psm_df = psm_df[~psm_df.raw_name.str.startswith('raw1')].copy()\n", + "matching = PepSpecMatch()\n", + "matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')\n", + "matching.load_ms_data(copy.deepcopy(mgf), 'mgf')\n", + "df, frag_mz_df, frag_inten_df, frag_merr_df = matching.match_ms2_one_raw(\n", + " psm_df\n", + ")\n", + "assert (matching.fragment_mz_df==frag_mz_df).values.all()\n", + "assert (matching.matched_intensity_df==frag_inten_df).values.all()\n", + "assert (matching.matched_mz_err_df==frag_merr_df).values.all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.3 ('base')", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs_tests/match/test_dia_matching.ipynb b/nbs_tests/match/test_dia_matching.ipynb new file mode 100644 index 00000000..6e74716d --- /dev/null +++ b/nbs_tests/match/test_dia_matching.ipynb @@ -0,0 +1,846 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:WARNING: Temp mmap arrays are written to /var/folders/fh/hf8t3l1x02d42ggk3b304_rh0000gn/T/temp_mmap_37vmr95k. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 624,114,790,400\n", + "WARNING:root:WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n" + ] + } + ], + "execution_count": 2, + "source": "from peptdeep.match.psm_match import PepSpecMatch_DIA" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "psm_match = PepSpecMatch_DIA()\n", + "psm_match.min_frag_mz = 0\n", + "# psm_match.raw_data.save_hdf(\"/Users/wenfengzeng/data/orbi_dia/hla/20200317_QE_HFX2_LC3_DIA_RA957_R01.raw.hdf\")" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from alphabase.psm_reader.dia_psm_reader import DiannReader\n", + "psm_df = DiannReader().import_file(\"/Users/wenfengzeng/data/orbi_dia/hla/20200317_QE_HFX2_LC3_DIA_RA957_R01.tsv\")\n", + "psm_df" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "ms_files = [\"/Users/wenfengzeng/data/orbi_dia/hla/20200317_QE_HFX2_LC3_DIA_RA957_R01.raw.hdf\"]\n", + "df,mz_df,inten_df,merr_df = psm_match.match_ms2_multi_raw(psm_df, ms_files)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
raw_namesequencechargertrt_startrt_stopmobilityproteinsuniprot_idsgenes...fdrspec_idxmodsmod_sitesnAArt_normprecursor_mzccsfrag_start_idxfrag_stop_idx
020200317_QE_HFX2_LC3_DIA_RA957_R01FPTSSPTL177.656677.441477.87150Q15696;Q15695...0.0015955501780.648554849.4352600.007
120200317_QE_HFX2_LC3_DIA_RA957_R01ANSAFVER146.319546.135246.53490P08238;Q58FF7;P07900;P07900-2...0.0000973262280.386840893.4475570.0714
220200317_QE_HFX2_LC3_DIA_RA957_R01ANVAEHQR223.864623.707924.02120Q12929;Q12929-2...0.0014921674580.199307462.7359400.01421
320200317_QE_HFX2_LC3_DIA_RA957_R01APAAGEEG119.426319.269619.58290Q9BYE7-3;Q9BYE7...0.0012191363180.162240701.3100600.02128
420200317_QE_HFX2_LC3_DIA_RA957_R01HRIVSISL267.902167.717768.02520Q5VY09...0.0007134802980.567089462.7849020.02835
..................................................................
6042120200317_QE_HFX2_LC3_DIA_RA957_R01TPSKQSNNKYAASSYLSLTPEQWK378.764178.641478.85620P0DOY2;P0CF74;P0DOY3...0.00009755810240.657804910.1240790.0534642534665
6042220200317_QE_HFX2_LC3_DIA_RA957_R01VDDTQFVRFDSDAASPRTEPRAPW388.949988.734889.10310P30490;P30685;P30491;P10319;P18463...0.00051663092240.742871921.7755620.0534665534688
6042320200317_QE_HFX2_LC3_DIA_RA957_R01VDDTQFVRFDSDAASPRGEPREPW386.270386.177986.42380P30504...0.00039961178240.720492926.4353170.0534688534711
6042420200317_QE_HFX2_LC3_DIA_RA957_R01ASGSGAQVGGPISSGSSASSVTVTR348.606748.482748.69920P02545;P02545-3;P02545-5;P02545-4...0.00071334246250.405942736.3681320.0534711534735
6042520200317_QE_HFX2_LC3_DIA_RA957_R01SGGGGGGSSGGRGSGGGSSGGSIGG218.241518.147518.30420P04264;CON__P04264...0.00009712800250.152345905.3889550.0534735534759
\n", + "

60426 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " raw_name sequence charge \\\n", + "0 20200317_QE_HFX2_LC3_DIA_RA957_R01 FPTSSPTL 1 \n", + "1 20200317_QE_HFX2_LC3_DIA_RA957_R01 ANSAFVER 1 \n", + "2 20200317_QE_HFX2_LC3_DIA_RA957_R01 ANVAEHQR 2 \n", + "3 20200317_QE_HFX2_LC3_DIA_RA957_R01 APAAGEEG 1 \n", + "4 20200317_QE_HFX2_LC3_DIA_RA957_R01 HRIVSISL 2 \n", + "... ... ... ... \n", + "60421 20200317_QE_HFX2_LC3_DIA_RA957_R01 TPSKQSNNKYAASSYLSLTPEQWK 3 \n", + "60422 20200317_QE_HFX2_LC3_DIA_RA957_R01 VDDTQFVRFDSDAASPRTEPRAPW 3 \n", + "60423 20200317_QE_HFX2_LC3_DIA_RA957_R01 VDDTQFVRFDSDAASPRGEPREPW 3 \n", + "60424 20200317_QE_HFX2_LC3_DIA_RA957_R01 ASGSGAQVGGPISSGSSASSVTVTR 3 \n", + "60425 20200317_QE_HFX2_LC3_DIA_RA957_R01 SGGGGGGSSGGRGSGGGSSGGSIGG 2 \n", + "\n", + " rt rt_start rt_stop mobility proteins \\\n", + "0 77.6566 77.4414 77.8715 0 \n", + "1 46.3195 46.1352 46.5349 0 \n", + "2 23.8646 23.7079 24.0212 0 \n", + "3 19.4263 19.2696 19.5829 0 \n", + "4 67.9021 67.7177 68.0252 0 \n", + "... ... ... ... ... ... \n", + "60421 78.7641 78.6414 78.8562 0 \n", + "60422 88.9499 88.7348 89.1031 0 \n", + "60423 86.2703 86.1779 86.4238 0 \n", + "60424 48.6067 48.4827 48.6992 0 \n", + "60425 18.2415 18.1475 18.3042 0 \n", + "\n", + " uniprot_ids genes ... fdr spec_idx \\\n", + "0 Q15696;Q15695 ... 0.001595 55017 \n", + "1 P08238;Q58FF7;P07900;P07900-2 ... 0.000097 32622 \n", + "2 Q12929;Q12929-2 ... 0.001492 16745 \n", + "3 Q9BYE7-3;Q9BYE7 ... 0.001219 13631 \n", + "4 Q5VY09 ... 0.000713 48029 \n", + "... ... ... ... ... ... \n", + "60421 P0DOY2;P0CF74;P0DOY3 ... 0.000097 55810 \n", + "60422 P30490;P30685;P30491;P10319;P18463 ... 0.000516 63092 \n", + "60423 P30504 ... 0.000399 61178 \n", + "60424 P02545;P02545-3;P02545-5;P02545-4 ... 0.000713 34246 \n", + "60425 P04264;CON__P04264 ... 0.000097 12800 \n", + "\n", + " mods mod_sites nAA rt_norm precursor_mz ccs frag_start_idx \\\n", + "0 8 0.648554 849.435260 0.0 0 \n", + "1 8 0.386840 893.447557 0.0 7 \n", + "2 8 0.199307 462.735940 0.0 14 \n", + "3 8 0.162240 701.310060 0.0 21 \n", + "4 8 0.567089 462.784902 0.0 28 \n", + "... ... ... .. ... ... ... ... \n", + "60421 24 0.657804 910.124079 0.0 534642 \n", + "60422 24 0.742871 921.775562 0.0 534665 \n", + "60423 24 0.720492 926.435317 0.0 534688 \n", + "60424 25 0.405942 736.368132 0.0 534711 \n", + "60425 25 0.152345 905.388955 0.0 534735 \n", + "\n", + " frag_stop_idx \n", + "0 7 \n", + "1 14 \n", + "2 21 \n", + "3 28 \n", + "4 35 \n", + "... ... \n", + "60421 534665 \n", + "60422 534688 \n", + "60423 534711 \n", + "60424 534735 \n", + "60425 534759 \n", + "\n", + "[60426 rows x 22 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b_z1b_z2y_z1y_z2b_modloss_z1b_modloss_z2y_modloss_z1y_modloss_z2
00.0000000.00.0000000.00.00.00.00.0
1190300.5781250.037765.1835940.00.00.00.00.0
20.0000000.00.0000000.00.00.00.00.0
30.0000000.00.0000000.00.00.00.00.0
40.0000000.00.0000000.00.00.00.00.0
...........................
5347540.0000000.00.0000000.00.00.00.00.0
5347550.0000000.00.0000000.00.00.00.00.0
5347560.0000000.00.0000000.00.00.00.00.0
5347570.0000000.00.0000000.00.00.00.00.0
5347580.0000000.00.0000000.00.00.00.00.0
\n", + "

534759 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n", + "0 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "1 190300.578125 0.0 37765.183594 0.0 0.0 0.0 \n", + "2 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "3 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "4 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "... ... ... ... ... ... ... \n", + "534754 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534755 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534756 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534757 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534758 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "\n", + " y_modloss_z1 y_modloss_z2 \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "... ... ... \n", + "534754 0.0 0.0 \n", + "534755 0.0 0.0 \n", + "534756 0.0 0.0 \n", + "534757 0.0 0.0 \n", + "534758 0.0 0.0 \n", + "\n", + "[534759 rows x 8 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inten_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PCCCOSSASPC
count20142.00000020142.00000020142.00000020142.000000
mean0.7931840.8017610.6626810.786549
std0.2595750.2522180.2517220.137454
min-0.0500930.0000000.0000000.026527
25%0.7223880.7366390.5302220.704215
50%0.9005110.9057500.7244000.805559
75%0.9724940.9738260.8558590.894600
max1.0000031.0000021.0000001.000000
>0.900.5108070.5227220.1674940.260136
>0.750.7268560.7396320.4610600.635836
\n", + "
" + ], + "text/plain": [ + " PCC COS SA SPC\n", + "count 20142.000000 20142.000000 20142.000000 20142.000000\n", + "mean 0.793184 0.801761 0.662681 0.786549\n", + "std 0.259575 0.252218 0.251722 0.137454\n", + "min -0.050093 0.000000 0.000000 0.026527\n", + "25% 0.722388 0.736639 0.530222 0.704215\n", + "50% 0.900511 0.905750 0.724400 0.805559\n", + "75% 0.972494 0.973826 0.855859 0.894600\n", + "max 1.000003 1.000002 1.000000 1.000000\n", + ">0.90 0.510807 0.522722 0.167494 0.260136\n", + ">0.75 0.726856 0.739632 0.461060 0.635836" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.model.ms2 import calc_ms2_similarity\n", + "metrics_list = []\n", + "frag_len = len(inten_df)//psm_match.max_spec_per_query\n", + "_df = df.iloc[:len(df)//psm_match.max_spec_per_query].copy()\n", + "for i in range(psm_match.max_spec_per_query):\n", + " for j in range(i+1,psm_match.max_spec_per_query):\n", + " _df,metrics_df = calc_ms2_similarity(\n", + " _df,\n", + " inten_df[i*frag_len:(i+1)*frag_len],\n", + " inten_df[j*frag_len:(j+1)*frag_len],\n", + " )\n", + " metrics_list.append(metrics_df)\n", + "sum(metrics_list)/len(metrics_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/peptdeep/mass_spec/match.py b/peptdeep/mass_spec/match.py index 7e9f8041..eff9cd85 100644 --- a/peptdeep/mass_spec/match.py +++ b/peptdeep/mass_spec/match.py @@ -213,7 +213,10 @@ def match_one_raw_with_numba( class PepSpecMatch(object): - """Main entry for peptide-spectrum matching""" + """Main entry for peptide-spectrum matching. + + TODO: figure out relation with `peptdeep.match.psm_match.PepSpecMatch`. + """ def __init__( self, diff --git a/peptdeep/match/__init__.py b/peptdeep/match/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/peptdeep/match/normal_dia.py b/peptdeep/match/normal_dia.py new file mode 100644 index 00000000..cce2ebf0 --- /dev/null +++ b/peptdeep/match/normal_dia.py @@ -0,0 +1,50 @@ +"""DIA grouping for normal DIA data, which is not acquired with ion mobility separation. Moved from alpharaw.""" + +import typing +from collections import defaultdict + +import numpy as np + +from alpharaw.ms_data_base import MSData_Base + + +class NormalDIAGrouper: + def __init__(self, ms_data: MSData_Base): + self.ms_data = ms_data + self.ms_data.spectrum_df["dia_group"] = ms_data.spectrum_df.precursor_mz.astype( + int + ) + + self.dia_group_dfs = self.ms_data.spectrum_df.groupby("dia_group") + self.dia_isolation_dict = {} + for dia_group, df in self.dia_group_dfs: + if dia_group == -1: + continue + self.dia_isolation_dict[dia_group] = ( + df.isolation_lower_mz.values[0], + df.isolation_upper_mz.values[0], + ) + self.dia_groups = np.sort(list(self.dia_isolation_dict.keys())) + + def assign_dia_groups( + self, precursor_mzs + ) -> typing.DefaultDict[typing.List, typing.List]: + dia_precursor_groups = defaultdict(list) + for i, mz in enumerate(precursor_mzs): + i_group = np.searchsorted(self.dia_groups, int(mz)) + if i_group == 0: + i_group = 1 + elif i_group == len(self.dia_groups): + i_group -= 1 + dia_group = self.dia_groups[i_group] + if dia_group in self.dia_isolation_dict: + isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group] + if mz >= isolation_lower and mz <= isolation_upper: + dia_precursor_groups[dia_group].append(i) + continue + dia_group = self.dia_groups[i_group - 1] + if dia_group in self.dia_isolation_dict: + isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group] + if mz >= isolation_lower and mz <= isolation_upper: + dia_precursor_groups[dia_group].append(i) + return dia_precursor_groups diff --git a/peptdeep/match/psm_match.py b/peptdeep/match/psm_match.py new file mode 100644 index 00000000..e34c1d09 --- /dev/null +++ b/peptdeep/match/psm_match.py @@ -0,0 +1,615 @@ +"""This module provides the `PepSpecMatch` class to match fragment ions of PSMs. Moved from alpharaw.""" + +from typing import Tuple, Union + +import numba +import numpy as np +import pandas as pd +import tqdm +from alphabase.peptide.fragment import ( + create_fragment_mz_dataframe, + get_charged_frag_types, +) + +from peptdeep.match.utils import load_ms_data, match_one_raw_with_numba +from peptdeep.match.normal_dia import NormalDIAGrouper +from alpharaw import register_all_readers +from alpharaw.match.match_utils import ( + match_closest_peaks, + match_highest_peaks, +) +from alpharaw.match.spec_finder import find_dia_spec_idxes_same_window +from alpharaw.ms_data_base import ( + PEAK_INTENSITY_DTYPE, + PEAK_MZ_DTYPE, + MSData_Base, +) +from alpharaw.utils.ms_path_utils import parse_ms_files_to_dict + +register_all_readers() # TODO remove this import side effect + + +class PepSpecMatch: + """ + Extract fragment ions from MS2 data. + The extracted information can be used for visualization of peak annotation or + PeptDeep transfer learnining for the MS2 model. + + TODO: figure out relation with `peptdeep.mass_spec.match.PepSpecMatch`. + """ + + match_closest: bool = True + use_ppm: bool = True + #: matching mass tolerance + tolerance: float = 20.0 + ms_loader_thread_num: int = 4 + + def __init__( + self, + charged_frag_types: list = None, + match_closest: bool = True, + use_ppm: bool = True, + tol_value: float = 20.0, + ): + """ + Parameters + ---------- + charged_frag_types : list, optional + fragment types with charge states, + e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2']. + Defaults to `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`. + + match_closest : bool, optional + if True, match the closest peak for a m/z; + if False, matched the higest peak for a m/z in the tolerance range. + By default True. + + use_ppm : bool, optional + If use ppm other wise Da, by default True. + + tol_value : float, optional + Matching tolerance value (ppm or Da based on `use_ppm`) + for peak annotation, by default 20.0 + """ + self.charged_frag_types = ( + get_charged_frag_types(["b", "y", "b_modloss", "y_modloss"], 2) + if charged_frag_types is None + else charged_frag_types + ) + self.match_closest = match_closest + self.use_ppm = use_ppm + self.tolerance = tol_value + + def get_fragment_mz_df(self) -> pd.DataFrame: + """ + Call :func:`alphabase.peptide.fragment.create_fragment_mz_dataframe` + for :attr:`PepSpecMatch.psm_df` and :attr:`PepSpecMatch.charged_frag_types`. + + + Returns + ------- + DataFrame + The fragment m/z dataframe in alphabase format. + """ + return create_fragment_mz_dataframe( + self.psm_df, + self.charged_frag_types, + dtype=PEAK_MZ_DTYPE, + ) + + def _add_missing_columns_to_psm_df( + self, psm_df: pd.DataFrame, raw_data: MSData_Base = None + ): + """ + Add missing "rt", "nce", "rt_norm", ("mobility") columns to `psm_df` inplace if missing. + + Parameters + ---------- + psm_df : pd.DataFrame + psm dataframe to be processed. + raw_data : MSData_Base, optional + The `MSData_Base`. If None, `self.raw_data`. by default None. + + Returns + ------- + DataFrame + The original `psm_df` with missing columns added. + """ + if raw_data is None: + raw_data = self.raw_data + add_spec_info_list = [] + if "rt" not in psm_df.columns: + add_spec_info_list.append("rt") + + if "nce" in raw_data.spectrum_df.columns and "nce" not in psm_df.columns: + add_spec_info_list.append("nce") + + if ( + "mobility" not in psm_df.columns + and "mobility" in raw_data.spectrum_df.columns + ): + add_spec_info_list.append("mobility") + + if len(add_spec_info_list) > 0: + # pfind does not report RT in the result file + psm_df = ( + psm_df.reset_index() + .merge( + raw_data.spectrum_df[["spec_idx"] + add_spec_info_list], + how="left", + on="spec_idx", + ) + .set_index("index") + ) + + if "rt" in add_spec_info_list: + psm_df["rt_norm"] = psm_df.rt / raw_data.spectrum_df.rt.max() + # if 'rt_sec' not in psm_df.columns: + # psm_df['rt_sec'] = psm_df.rt*60 + return psm_df + + def _prepare_matching_dfs(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Prepare empty `fragment_mz_df`, `matched_intensity_df`, + and `matched_mz_err_df` dataframes to extract peak matching information + for `self.psm_df`. These three dataframes will be only used internally + in :class:`PepSpecMatch` objects. + + Returns + ------- + Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] + pd.DataFrame: fragment mz dataframe. + + pd.DataFrame: intensity dataframe to match. + + pd.DataFrame: mz error dataframe to match. + """ + fragment_mz_df = self.get_fragment_mz_df() + + matched_intensity_df = pd.DataFrame( + np.zeros_like(fragment_mz_df.values, dtype=PEAK_INTENSITY_DTYPE), + columns=fragment_mz_df.columns, + ) + + matched_mz_err_df = pd.DataFrame( + np.full_like(fragment_mz_df.values, np.inf, dtype=PEAK_MZ_DTYPE), + columns=fragment_mz_df.columns, + ) + return (fragment_mz_df, matched_intensity_df, matched_mz_err_df) + + def load_ms_data( + self, + ms_file: Union[str, MSData_Base], + ms_file_type: str = "alpharaw_hdf", + process_count: int = 8, + **kwargs, + ): + """Load MS file to set `self.raw_data`. + + Parameters + ---------- + ms_file : str | MSData_Base + Absolute or relative path of the ms2 file. + + ms_file_type : str, optional + ms2 file type, could be + ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"]. + Default to 'alpharaw_hdf'. + """ + self.raw_data = load_ms_data(ms_file, ms_file_type, process_count=process_count) + + def get_peaks(self, spec_idx: int, **kwargs): + return self.raw_data.get_peaks(spec_idx) + + def _match_one_psm( + self, + peak_mzs: np.ndarray, + peak_intensities: np.ndarray, + fragment_mz_df: pd.DataFrame, + matched_intensity_df: pd.DataFrame, + matched_mz_err_df: pd.DataFrame, + frag_start_idx: int, + frag_stop_idx: int, + ): + """ + Match fragments of one precursor (located by `frag_start_idx` and `frag_stop_idx`) + against the corresponding `peak_mzs`. + + Parameters + ---------- + peak_mzs : np.ndarray + Peak m/z values to be matched. + peak_intensities : np.ndarray + Peak intensities to be matched. + fragment_mz_df : pd.DataFrame + fragment m/z dataframe to be matched. + matched_intensity_df : pd.DataFrame + The dataframe to store matched intensity values. + matched_mz_err_df : pd.DataFrame + The dataframe to store matched mz error values. + frag_start_idx : int + fragment start index of the given PSM. + frag_stop_idx : int + fragment stop index of the given PSM. + """ + if len(peak_mzs) == 0: + return + + peak_mzs = peak_mzs.astype(PEAK_MZ_DTYPE) + + frag_mzs = fragment_mz_df.values[frag_start_idx:frag_stop_idx, :] + + if self.use_ppm: + mz_tols = frag_mzs * self.tolerance * 1e-6 + else: + mz_tols = np.full_like(frag_mzs, self.tolerance) + + if self.match_closest: + matched_idxes = match_closest_peaks( + peak_mzs, peak_intensities, frag_mzs, mz_tols + ) + else: + matched_idxes = match_highest_peaks( + peak_mzs, + peak_intensities, + frag_mzs, + mz_tols, + ) + + matched_intens = peak_intensities[matched_idxes] + matched_intens[matched_idxes == -1] = 0 + + matched_mz_errs = np.abs(peak_mzs[matched_idxes] - frag_mzs) + matched_mz_errs[matched_idxes == -1] = np.inf + + matched_intensity_df.values[frag_start_idx:frag_stop_idx, :] = matched_intens + + matched_mz_err_df.values[frag_start_idx:frag_stop_idx, :] = matched_mz_errs + + def match_ms2_one_raw( + self, + psm_df_one_raw: pd.DataFrame, + verbose: bool = False, + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Matching psm_df_one_raw against self.raw_data + after `self.load_ms_data()` + + Parameters + ---------- + psm_df_one_raw : pd.DataFrame + psm dataframe + that contains only one raw file + + Returns + ------- + Tuple: + pd.DataFrame: psm dataframe with fragment index information. + + pd.DataFrame: fragment mz dataframe. + + pd.DataFrame: matched intensity dataframe. + + pd.DataFrame: matched mass error dataframe. + np.inf if a fragment is not matched. + """ + self.psm_df = psm_df_one_raw + + psm_df_one_raw = self._add_missing_columns_to_psm_df( + psm_df_one_raw, self.raw_data + ) + + ( + fragment_mz_df, + matched_intensity_df, + matched_mz_err_df, + ) = self._prepare_matching_dfs() + + psm_iters = psm_df_one_raw[ + ["spec_idx", "frag_start_idx", "frag_stop_idx"] + ].values + if verbose: + psm_iters = tqdm.tqdm(psm_iters) + + for spec_idx, frag_start_idx, frag_stop_idx in psm_iters: + (spec_mzs, spec_intens) = self.get_peaks(spec_idx) + + self._match_one_psm( + spec_mzs, + spec_intens, + fragment_mz_df, + matched_intensity_df, + matched_mz_err_df, + frag_start_idx, + frag_stop_idx, + ) + + return (psm_df_one_raw, fragment_mz_df, matched_intensity_df, matched_mz_err_df) + + def _match_ms2_one_raw_numba(self, raw_name: str, psm_df_one_raw: pd.DataFrame): + if raw_name in self._ms_file_dict: + raw_data = load_ms_data( + self._ms_file_dict[raw_name], + self._ms_file_type, + process_count=self.ms_loader_thread_num, + ) + if raw_data is None: + return + + psm_df_one_raw = self._add_missing_columns_to_psm_df( + psm_df_one_raw, raw_data + ) + + if self.use_ppm: + all_frag_mz_tols = self.fragment_mz_df.values * self.tolerance * 1e-6 + else: + all_frag_mz_tols = np.full_like( + self.fragment_mz_df.values, self.tolerance + ) + + match_one_raw_with_numba( + psm_df_one_raw.spec_idx.values, + psm_df_one_raw.frag_start_idx.values, + psm_df_one_raw.frag_stop_idx.values, + self.fragment_mz_df.values, + all_frag_mz_tols, + raw_data.peak_df.mz.values, + raw_data.peak_df.intensity.values, + raw_data.spectrum_df.peak_start_idx.values, + raw_data.spectrum_df.peak_stop_idx.values, + self.matched_intensity_df.values, + self.matched_mz_err_df.values, + self.match_closest, + ) + else: + print(f"`{raw_name}` is not found in ms_file_dict.") + return + return psm_df_one_raw + + def match_ms2_multi_raw( + self, + psm_df: pd.DataFrame, + ms_files: Union[dict, list], + ms_file_type: str = "alpharaw_hdf", + process_num: int = 1, + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Matching PSM dataframe against the ms2 files in ms_files + This method will store matched values as attributes: + - self.psm_df + - self.fragment_mz_df + - self.matched_intensity_df + - self.matched_mz_err_df + + Parameters + ---------- + psm_df : pd.DataFrame + PSM dataframe + + ms_files : dict | list + if dict: {raw_name: ms2 path} + if list: [ms2 path1, ms2 path2] + + ms_file_type : str, optional + Could be 'alpharaw_hdf', 'mgf' or 'thermo', 'sciex', 'alphapept_hdf'. + Defaults to 'alphapept'. + + Returns + ------- + Tuple: + pd.DataFrame: psm dataframe with fragment index information. + + pd.DataFrame: fragment mz dataframe. + + pd.DataFrame: matched intensity dataframe. + + pd.DataFrame: matched mass error dataframe. + np.inf if a fragment is not matched. + + """ + self.psm_df = psm_df + + ( + self.fragment_mz_df, + self.matched_intensity_df, + self.matched_mz_err_df, + ) = self._prepare_matching_dfs() + + if isinstance(ms_files, dict): + self._ms_file_dict = ms_files + else: + self._ms_file_dict = parse_ms_files_to_dict(ms_files) + + self._ms_file_type = ms_file_type + + self.ms_loader_thread_num = process_num + # if process_num <= 1 or len(self._ms_file_dict) <= 1: + psm_df_list = [] + for raw_name, df_group in tqdm.tqdm(self.psm_df.groupby("raw_name")): + _df = self._match_ms2_one_raw_numba(raw_name, df_group) + if _df is not None: + psm_df_list.append(_df) + # else: + # with mp.get_context("spawn").Pool(processes=process_num) as p: + # df_groupby = self.psm_df.groupby('raw_name') + # def gen_group_df(df_groupby): + # for raw_name, df_group in df_groupby: + # yield (raw_name, df_group) + # process_bar( + # p.imap_unordered( + # self._match_ms2_one_raw_numba, + # gen_group_df(df_groupby) + # ), df_groupby.ngroups + # ) + + self.psm_df = pd.concat(psm_df_list, ignore_index=True) + return ( + self.psm_df, + self.fragment_mz_df, + self.matched_intensity_df, + self.matched_mz_err_df, + ) + + +class PepSpecMatch_DIA(PepSpecMatch): + """ + Peak annotation for DIA data. + """ + + max_spec_per_query: int = 3 + min_frag_mz: float = 200.0 + + def _add_missing_columns_to_psm_df(self, psm_df: pd.DataFrame, raw_data=None): + # DIA results do not have spec_idx/scan_num in psm_df, nothing to merge + return psm_df + + def _prepare_matching_dfs(self): + fragment_mz_df = self.get_fragment_mz_df() + fragment_mz_df = pd.concat( + [fragment_mz_df] * self.max_spec_per_query, ignore_index=True + ) + if self.use_ppm: + self.all_frag_mz_tols = fragment_mz_df.values * self.tolerance * 1e-6 + else: + self.all_frag_mz_tols = np.full_like(fragment_mz_df.values, self.tolerance) + + psm_df_list = [] + len_frags = len(fragment_mz_df) // self.max_spec_per_query + for i in range(self.max_spec_per_query): + psm_df = self.psm_df.copy() + psm_df["frag_start_idx"] = psm_df.frag_start_idx + i * len_frags + psm_df["frag_stop_idx"] = psm_df.frag_stop_idx + i * len_frags + psm_df_list.append(psm_df) + self.psm_df = pd.concat(psm_df_list, ignore_index=True) + + matched_intensity_df = pd.DataFrame( + np.zeros_like(fragment_mz_df.values, dtype=PEAK_INTENSITY_DTYPE), + columns=fragment_mz_df.columns, + ) + + matched_mz_err_df = pd.DataFrame( + np.zeros_like(fragment_mz_df.values, dtype=PEAK_MZ_DTYPE), + columns=fragment_mz_df.columns, + ) + + return (fragment_mz_df, matched_intensity_df, matched_mz_err_df) + + def _match_ms2_one_raw_numba( + self, raw_name: str, psm_df_one_raw: pd.DataFrame + ) -> pd.DataFrame: + """ + Internal method to extract peak information with numba as backend. + + Parameters + ---------- + raw_name : str + The raw name of the raw file. `psm_df_one_raw` dataframe should also + contain the same raw name in `raw_name` column. + psm_df_one_raw : pd.DataFrame + The dataframe for PSMs. + + Returns + ------- + pd.DataFrame + `psm_df_one_raw` + """ + psm_df_one_raw = psm_df_one_raw.reset_index(drop=True) + + if raw_name in self._ms_file_dict: + raw_data = load_ms_data( + self._ms_file_dict[raw_name], + self._ms_file_type, + process_count=self.ms_loader_thread_num, + ) + if raw_data is None: + return + + psm_origin_len = len(psm_df_one_raw) // self.max_spec_per_query + + grouper = NormalDIAGrouper(raw_data) + + psm_groups = grouper.assign_dia_groups( + psm_df_one_raw.precursor_mz.values[:psm_origin_len] + ) + + all_spec_idxes = np.full(len(psm_df_one_raw), -1, dtype=np.int32) + + for dia_group, group_df in grouper.dia_group_dfs: + psm_idxes = psm_groups[dia_group] + if len(psm_idxes) == 0: + continue + psm_idxes = np.array(psm_idxes, dtype=np.int32) + spec_idxes = find_dia_spec_idxes_same_window( + group_df.rt.values, + psm_df_one_raw.rt.values[psm_idxes], + max_spec_per_query=self.max_spec_per_query, + ) + for i in range(spec_idxes.shape[-1]): + all_spec_idxes[psm_idxes + psm_origin_len * i] = spec_idxes[:, i] + + match_one_raw_with_numba( + all_spec_idxes, + psm_df_one_raw.frag_start_idx.values, + psm_df_one_raw.frag_stop_idx.values, + self.fragment_mz_df.values, + self.all_frag_mz_tols, + raw_data.peak_df.mz.values, + raw_data.peak_df.intensity.values, + group_df.peak_start_idx.values, + group_df.peak_stop_idx.values, + self.matched_intensity_df.values, + self.matched_mz_err_df.values, + self.match_closest, + ) + else: + print(f"`{raw_name}` is not found in ms_file_dict.") + return + return psm_df_one_raw + + def match_ms2_multi_raw( + self, + psm_df: pd.DataFrame, + ms_files: Tuple[dict, list], + ms_file_type: str = "alpharaw_hdf", + process_num: int = 8, + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Match peaks for the given `psm_df` against the corresponding MS spectrum files. + + Parameters + ---------- + psm_df : pd.DataFrame + Peptide-spectrum matches in alphabase dataframe format. + ms_files : Tuple[dict, list] + The absolute or relative paths of MS files. + if the type is `dict`, the format will be + `{'raw_name1': 'raw_name1.raw', ...}` if `ms_file_type` is `thermo_raw`. + ms_file_type : str, optional + MS file type that is already registered in + :obj:`alpharaw.ms_data_base.ms_reader_provider`. + By default "alpharaw_hdf". + process_num : int, optional + Match peaks by using multiprocessing, by default 8 + + Returns + ------- + Tuple + pd.DataFrame: the `psm_df`. + + pd.DataFrame: fragment m/z dataframe in alphabase format. + + pd.DataFrame: the matched fragment intensity dataframe in alphabase format. + + pd.DataFrame: the matched mass error in the same dataframe format. + """ + if isinstance(ms_files, list): + ms_files = parse_ms_files_to_dict(ms_files) + psm_df = psm_df[psm_df.raw_name.isin(ms_files)].reset_index(drop=True) + super().match_ms2_multi_raw(psm_df, ms_files, ms_file_type, process_num) + + return ( + self.psm_df, + self.fragment_mz_df, + self.matched_intensity_df, + self.matched_mz_err_df, + ) diff --git a/peptdeep/match/utils.py b/peptdeep/match/utils.py new file mode 100644 index 00000000..4ed29727 --- /dev/null +++ b/peptdeep/match/utils.py @@ -0,0 +1,138 @@ +"""Utilities for psm matching. Moved from alpharaw.""" + +from typing import Union + +import numba +import numpy as np + +from alpharaw.match.match_utils import match_closest_peaks, match_highest_peaks +from alpharaw.ms_data_base import MSData_Base, ms_reader_provider + + +@numba.jit(nogil=True) +def match_one_raw_with_numba( + spec_idxes: np.ndarray, + frag_start_idxes: np.ndarray, + frag_stop_idxes: np.ndarray, + all_frag_mzs: np.ndarray, + all_frag_mz_tols: np.ndarray, + all_spec_mzs: np.ndarray, + all_spec_intensities: np.ndarray, + peak_start_idxes: np.ndarray, + peak_stop_idxes: np.ndarray, + matched_intensities: np.ndarray, + matched_mz_errs: np.ndarray, + match_closest: bool = True, +) -> None: + """ + Internel function to match fragment mz values to spectrum mz values. + Matched_mz_errs[i] = np.inf if no peaks are matched. + + Results will saved in place of matched_intensities + and matched_mz_errs. + """ + for spec_idx, frag_start, frag_stop in zip( + spec_idxes, frag_start_idxes, frag_stop_idxes + ): + if spec_idx == -1: + continue + peak_start = peak_start_idxes[spec_idx] + peak_stop = peak_stop_idxes[spec_idx] + if peak_stop == peak_start: + continue + spec_mzs = all_spec_mzs[peak_start:peak_stop] + spec_intens = all_spec_intensities[peak_start:peak_stop] + + frag_mzs = all_frag_mzs[frag_start:frag_stop, :].copy() + frag_mz_tols = all_frag_mz_tols[frag_start:frag_stop, :].copy() + + if match_closest: + matched_idxes = match_closest_peaks( + spec_mzs, spec_intens, frag_mzs, frag_mz_tols + ).reshape(-1) + else: + matched_idxes = match_highest_peaks( + spec_mzs, spec_intens, frag_mzs, frag_mz_tols + ).reshape(-1) + + matched_intens = spec_intens[matched_idxes] + matched_intens[matched_idxes == -1] = 0 + + matched_mass_errs = np.abs( + spec_mzs[matched_idxes.reshape(-1)] - frag_mzs.reshape(-1) + ) + matched_mass_errs[matched_idxes == -1] = np.inf + + matched_intensities[frag_start:frag_stop, :] = matched_intens.reshape( + frag_mzs.shape + ) + + matched_mz_errs[frag_start:frag_stop, :] = matched_mass_errs.reshape( + frag_mzs.shape + ) + + +def load_ms_data( + ms_file: Union[str, MSData_Base], + ms_file_type: str = "alpharaw_hdf", + process_count: int = 8, +) -> MSData_Base: + """ + Load MS file and get `MSData_Base` object. + + Parameters + ---------- + ms_file : str | MSData_Base + ms2 file path + + ms_file_type : str, optional + ms2 file type, can be + ["alpharaw_hdf", "thermo", "sciex", "alphapept_hdf", "mgf"]. + Default to 'alpharaw_hdf'. + + Returns + ------- + MSData_Base: + Instance of sub-class of `MSData_Base`. + """ + if isinstance(ms_file, MSData_Base): + return ms_file + else: + raw_data = ms_reader_provider.get_reader( + ms_file_type, process_count=process_count + ) + if raw_data is None: + print( + f"Raw file type '{ms_file_type}' is not registered in 'ms_reader_provider'" + ) + return + raw_data.import_raw(ms_file) + return raw_data + + +@numba.njit +def get_ion_count_scores( + frag_mz_values: np.ndarray, + matched_intens: np.ndarray, + frag_start_idxes: np.ndarray, + frag_stop_idxes: np.ndarray, + min_mz: float = 200, +): + """ + TODO Deprecated + """ + scores = [] + for i in range(len(frag_start_idxes)): + scores.append( + np.count_nonzero( + matched_intens[frag_start_idxes[i] : frag_stop_idxes[i], :] + .copy() + .reshape(-1)[ + frag_mz_values[frag_start_idxes[i] : frag_stop_idxes[i], :] + .copy() + .reshape(-1) + >= min_mz + ] + ) + ) + return np.array(scores, np.int32) diff --git a/peptdeep/pipeline_api.py b/peptdeep/pipeline_api.py index 03359d8e..8812d4e2 100644 --- a/peptdeep/pipeline_api.py +++ b/peptdeep/pipeline_api.py @@ -23,17 +23,14 @@ from peptdeep.pretrained_models import ModelManager -# from peptdeep.rescore.feature_extractor import match_one_raw -from alpharaw.match.psm_match import ( - PepSpecMatch, - PepSpecMatch_DIA, - parse_ms_files_to_dict, - get_ion_count_scores, -) from peptdeep.model.ms2 import calc_ms2_similarity import peptdeep.model.rt as rt_module +from peptdeep.match.psm_match import PepSpecMatch_DIA, PepSpecMatch +from peptdeep.match.utils import get_ion_count_scores +from alpharaw.utils.ms_path_utils import parse_ms_files_to_dict + DIA_max_spec_per_query = 3 DIA_min_ion_count = 6 DIA_min_frag_mz = 200.0 @@ -108,10 +105,10 @@ def get_median_pccs_for_dia_psms( ----- The `psm_df` contains `max_spec_per_query` copies per peptide, each matched against a different MS2 scan. The PSMs need to be sorted by spec index before processing because - `alpharaw.match.psm_match.PepSpecMatch.match_ms2_multi_raw` + `peptdeep.match.psm_match.PepSpecMatch.match_ms2_multi_raw` orders them by `raw_name` when multiple MS files are used. - See also (alpharaw.match.psm_match): + See also (peptdeep.match.psm_match): - `PepSpecMatch_DIA._prepare_matching_dfs`: creates replicated PSM/fragment structure - `PepSpecMatch_DIA._match_ms2_one_raw_numba`: finds nearby MS2 scans and extracts fragments - `PepSpecMatch.match_ms2_multi_raw`: processes multiple MS files (reorders by raw_name) diff --git a/peptdeep/utils/__init__.py b/peptdeep/utils/__init__.py index 3e98499e..14a7e50c 100644 --- a/peptdeep/utils/__init__.py +++ b/peptdeep/utils/__init__.py @@ -12,7 +12,6 @@ import numpy as np -# from alphatims def process_bar(iterator, len_iter): with tqdm.tqdm(total=len_iter) as bar: i = 0 diff --git a/peptdeep/utils/logger.py b/peptdeep/utils/logger.py index 3e90ee19..91cf48cc 100644 --- a/peptdeep/utils/logger.py +++ b/peptdeep/utils/logger.py @@ -38,7 +38,7 @@ def set_logger( The file name to where the log is written. Folders are automatically created if needed. This is relative to the current path. When an empty string is provided, - a log is written to the AlphaTims "logs" folder with the name + a log is written to the PeptDeep "logs" folder with the name "log_yymmddhhmmss" (reversed timestamp year to seconds). If None, no log file is saved. Default is "". @@ -158,7 +158,7 @@ def show_python_info() -> None: This is done in the following format: ``` - [timestamp]> Python information: - - [timestamp]> alphatims - [current_version] + - [timestamp]> peptdeep - [current_version] - [timestamp]> [required package] - [current_version] - ... - [timestamp]> [required package] - [current_version] diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 6efaf9e7..14afcb6d 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -1,4 +1,6 @@ -TEST_NBS=$(find ../nbs_tests -name "*.ipynb") +# ignore notebooks that requires local data +TEST_NBS=$(find ../nbs_tests -name "*.ipynb" | grep -v "test_dia_matching.ipynb") + TUTORIAL_NBS=$(find ../docs/tutorials -name "*.ipynb") UNIT_TESTS=$(find ../tests/unit -name "test_*.py") ALL_TESTS=$(echo $TEST_NBS$'\n'$TUTORIAL_NBS$'\n'$UNIT_TESTS)