diff --git a/docs/build_docs.sh b/docs/build_docs.sh
index ec9c83ca..0aefd331 100644
--- a/docs/build_docs.sh
+++ b/docs/build_docs.sh
@@ -1,11 +1,9 @@
rm -rf build
conda env remove -n alphabasedocs
conda create -n alphabasedocs python=3.9 -y
-# conda create -n alphatimsinstaller python=3.9
+
conda activate alphabasedocs
-# call conda install git -y
-# call pip install 'git+https://github.com/MannLabs/alphatims.git#egg=alphatims[gui]' --use-feature=2020-resolver
-# brew install freetype
+
pip install '../.[development]'
make html
conda deactivate
diff --git a/nbs_tests/match/psm_match.ipynb b/nbs_tests/match/psm_match.ipynb
new file mode 100644
index 00000000..10599c6d
--- /dev/null
+++ b/nbs_tests/match/psm_match.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#---#| default_exp match.psm_match"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Match"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Peak matching functionalities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "from peptdeep.match.psm_match import PepSpecMatch\n",
+ "from alpharaw import register_all_readers\n",
+ "from alpharaw.ms_data_base import ms_reader_provider\n",
+ "\n",
+ "register_all_readers()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "import io\n",
+ "import copy\n",
+ "\n",
+ "from alphabase.psm_reader.pfind_reader import pFindReader\n",
+ "from alphabase.peptide.fragment import create_fragment_mz_dataframe_by_sort_precursor\n",
+ "from alpharaw.legacy_msdata.mgf import MGFReader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2/2 [00:01<00:00, 1.41it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "#| hide\n",
+ "#unittest\n",
+ "mgf = io.StringIO(\"\"\"\n",
+ "BEGIN IONS\n",
+ "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.31809.31809.3.0.dta\n",
+ "CHARGE=3+\n",
+ "RTINSECONDS=0.5418930\n",
+ "PEPMASS=272.276336\n",
+ "103.92207 5457.3\n",
+ "104.20045 5051.4\n",
+ "108.70090 5891.7\n",
+ "113.94175 6442.6\n",
+ "116.92975 40506.3\n",
+ "116.93716 8945.5\n",
+ "128.37773 6427.8\n",
+ "131.95308 288352.6\n",
+ "133.93259 7344.6\n",
+ "138.44611 7326.1\n",
+ "139.00072 41556.8\n",
+ "140.00319 16738.8\n",
+ "140.99719 9493.8\n",
+ "145.93156 10209.3\n",
+ "145.94897 10497.8\n",
+ "147.94559 8206.3\n",
+ "147.96396 30552.8\n",
+ "148.95543 14654.7\n",
+ "149.96338 234207.8\n",
+ "150.95096 8306.0\n",
+ "157.01089 84638.9\n",
+ "158.01357 27925.7\n",
+ "159.00627 16084.7\n",
+ "163.94281 24751.1\n",
+ "163.95915 32203.3\n",
+ "165.95605 44458.0\n",
+ "165.97186 11530.2\n",
+ "166.99500 26432.2\n",
+ "167.97302 9216.7\n",
+ "181.95230 13858.8\n",
+ "191.95448 66152.7\n",
+ "192.95538 8408.9\n",
+ "193.07185 9092.8\n",
+ "193.95313 660574.9\n",
+ "194.95674 23452.8\n",
+ "194.99008 143940.9\n",
+ "200.00568 19510.8\n",
+ "200.99942 23678.7\n",
+ "204.30894 9406.1\n",
+ "209.96466 21853.6\n",
+ "211.96245 65351.0\n",
+ "218.90355 9149.6\n",
+ "223.91072 11300.2\n",
+ "238.89684 12108.8\n",
+ "243.93825 10150.2\n",
+ "243.97040 10987.7\n",
+ "244.94121 8744.2\n",
+ "246.90314 11556.3\n",
+ "271.93225 29430.0\n",
+ "271.99219 51184.4\n",
+ "272.19150 31960.4\n",
+ "272.98602 35844.1\n",
+ "273.94431 11031.8\n",
+ "284.47998 8191.3\n",
+ "290.00125 66212.4\n",
+ "290.99539 54064.7\n",
+ "293.89490 10005.0\n",
+ "407.06372 10838.2\n",
+ "464.36697 9715.4\n",
+ "633.40036 633.40036\n",
+ "698.81390 9711.7\n",
+ "707.301117 707.301117\n",
+ "END IONS\n",
+ "BEGIN IONS\n",
+ "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23862.23862.2.0.dta\n",
+ "CHARGE=2+\n",
+ "RTINSECONDS=0.6455220\n",
+ "PEPMASS=287.427959\n",
+ "103.34669 5304.0\n",
+ "104.66884 5639.7\n",
+ "113.42419 6258.3\n",
+ "118.84039 5837.5\n",
+ "119.93203 13977.3\n",
+ "130.69589 6876.2\n",
+ "133.94824 43094.3\n",
+ "134.30524 7671.5\n",
+ "135.96359 9031.3\n",
+ "138.99994 8329.7\n",
+ "146.95573 31143.9\n",
+ "147.96323 12176.5\n",
+ "150.95151 65859.3\n",
+ "151.95818 24384.2\n",
+ "157.01105 19241.5\n",
+ "157.34985 7532.5\n",
+ "161.08838 7843.9\n",
+ "161.94234 20119.7\n",
+ "162.95146 60110.4\n",
+ "163.95877 183305.5\n",
+ "164.96657 13647.5\n",
+ "174.95139 150331.9\n",
+ "175.95258 21393.4\n",
+ "178.94460 11433.1\n",
+ "179.95316 13650.5\n",
+ "180.96204 15353.5\n",
+ "190.94572 30418.9\n",
+ "191.95422 61914.1\n",
+ "192.61461 8642.1\n",
+ "192.94395 12331.4\n",
+ "192.96207 132342.5\n",
+ "193.96318 19303.0\n",
+ "209.04164 25149.6\n",
+ "209.96368 154185.0\n",
+ "209.98361 12353.5\n",
+ "213.86244 11541.3\n",
+ "224.93071 12903.0\n",
+ "228.92879 8773.6\n",
+ "241.86043 135357.5\n",
+ "242.86113 20805.2\n",
+ "242.94327 26679.4\n",
+ "243.95219 29569.9\n",
+ "244.92361 12153.5\n",
+ "246.90300 16650.3\n",
+ "252.96521 73484.3\n",
+ "253.96646 11527.5\n",
+ "286.85858 10166.4\n",
+ "287.94186 18763.2\n",
+ "303.87665 39189.3\n",
+ "304.88116 11976.0\n",
+ "321.89087 97122.5\n",
+ "322.88867 28020.8\n",
+ "370.28696 9008.2\n",
+ "389.82578 13277.0\n",
+ "407.83545 12220.4\n",
+ "425.84872 13236.5\n",
+ "482.54852 10940.2\n",
+ "END IONS\n",
+ "BEGIN IONS\n",
+ "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23431.23431.2.0.dta\n",
+ "CHARGE=2+\n",
+ "RTINSECONDS=0.6455220\n",
+ "PEPMASS=287.427959\n",
+ "103.34669 5304.0\n",
+ "104.66884 5639.7\n",
+ "END IONS\n",
+ "BEGIN IONS\n",
+ "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.32733.32733.2.0.dta\n",
+ "CHARGE=2+\n",
+ "RTINSECONDS=0.6455220\n",
+ "PEPMASS=287.427959\n",
+ "103.34669 5304.0\n",
+ "104.66884 5639.7\n",
+ "402.705571 402.705571\n",
+ "END IONS\n",
+ "BEGIN IONS\n",
+ "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23669.23669.2.0.dta\n",
+ "CHARGE=2+\n",
+ "RTINSECONDS=0.6455220\n",
+ "PEPMASS=287.427959\n",
+ "END IONS\n",
+ "\"\"\")\n",
+ "\n",
+ "ms_file_dict = {\n",
+ " 'raw': copy.deepcopy(mgf),\n",
+ " 'raw1': copy.deepcopy(mgf),\n",
+ "}\n",
+ "\n",
+ "psmlabel_str = '''File_Name\tSequence\tModification\tCharge\tScan_No\tProteins\tQ-value\tTarget/Decoy\tFinal_Score\n",
+ "raw.31809.31809.2.0.dta\tPSTDLLMLK\t2,Phospho[S];7,Oxidation[M];\t2\t31809\tProt\t0\ttarget\t100\n",
+ "raw.23862.23862.2.0.dta\tHTAYSDFLSDK\t\t2\t23862\tProt\t0\ttarget\t100\n",
+ "raw.23431.23431.2.0.dta\tHTAYSDFLSDK\t\t2\t23431\tProt\t0\ttarget\t100\n",
+ "raw.32733.32733.2.0.dta\tHFALFSTDVTK\t\t2\t32733\tProt\t0\ttarget\t100\n",
+ "raw.23669.23669.2.0.dta\tHTAYSDFLSDK\t\t2\t23669\tProt\t0\ttarget\t100\n",
+ "raw1.31809.31809.2.0.dta\tPSTDLLMLK\t2,Phospho[S];7,Oxidation[M];\t2\t31809\tProt\t0\ttarget\t100\n",
+ "raw1.23862.23862.2.0.dta\tHTAYSDFLSDK\t\t2\t23862\tProt\t0\ttarget\t100\n",
+ "raw1.23431.23431.2.0.dta\tHTAYSDFLSDK\t\t2\t23431\tProt\t0\ttarget\t100\n",
+ "raw1.32733.32733.2.0.dta\tHFALFSTDVTK\t\t2\t32733\tProt\t0\ttarget\t100\n",
+ "'''\n",
+ "reader = pFindReader()\n",
+ "reader.import_file(io.StringIO(psmlabel_str))\n",
+ "psm_df = reader.psm_df\n",
+ "matching = PepSpecMatch()\n",
+ "matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')\n",
+ "merrs = matching.matched_mz_err_df.values\n",
+ "#np.sum(matching.matched_intensity_df.values!=0,axis=1)\n",
+ "assert len(merrs[~np.isinf(merrs)])==6\n",
+ "assert np.count_nonzero(matching.matched_intensity_df.values)==6"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 2/2 [00:00<00:00, 174.90it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "#| hide\n",
+ "mgf_reader = ms_reader_provider.get_reader('mgf')\n",
+ "mgf_reader.import_raw(copy.deepcopy(mgf))\n",
+ "ms_file_dict = {'raw': mgf_reader}\n",
+ "mgf_reader1 = ms_reader_provider.get_reader('mgf')\n",
+ "mgf_reader1.import_raw(copy.deepcopy(mgf))\n",
+ "ms_file_dict['raw1'] = mgf_reader1\n",
+ "matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')\n",
+ "merrs = matching.matched_mz_err_df.values\n",
+ "assert np.count_nonzero(matching.matched_intensity_df.values) == 6\n",
+ "assert len(merrs[~np.isinf(merrs)]) == 6"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 1/1 [00:00<00:00, 111.73it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "#| hide\n",
+ "ms_file_dict = {\n",
+ " 'raw': copy.deepcopy(mgf),\n",
+ "}\n",
+ "reader = pFindReader()\n",
+ "reader.import_file(io.StringIO(psmlabel_str))\n",
+ "psm_df = reader.psm_df\n",
+ "psm_df = psm_df[~psm_df.raw_name.str.startswith('raw1')].copy()\n",
+ "matching = PepSpecMatch()\n",
+ "matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')\n",
+ "matching.load_ms_data(copy.deepcopy(mgf), 'mgf')\n",
+ "df, frag_mz_df, frag_inten_df, frag_merr_df = matching.match_ms2_one_raw(\n",
+ " psm_df\n",
+ ")\n",
+ "assert (matching.fragment_mz_df==frag_mz_df).values.all()\n",
+ "assert (matching.matched_intensity_df==frag_inten_df).values.all()\n",
+ "assert (matching.matched_mz_err_df==frag_merr_df).values.all()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.8.3 ('base')",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/nbs_tests/match/test_dia_matching.ipynb b/nbs_tests/match/test_dia_matching.ipynb
new file mode 100644
index 00000000..6e74716d
--- /dev/null
+++ b/nbs_tests/match/test_dia_matching.ipynb
@@ -0,0 +1,846 @@
+{
+ "cells": [
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:WARNING: Temp mmap arrays are written to /var/folders/fh/hf8t3l1x02d42ggk3b304_rh0000gn/T/temp_mmap_37vmr95k. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 624,114,790,400\n",
+ "WARNING:root:WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n"
+ ]
+ }
+ ],
+ "execution_count": 2,
+ "source": "from peptdeep.match.psm_match import PepSpecMatch_DIA"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "psm_match = PepSpecMatch_DIA()\n",
+ "psm_match.min_frag_mz = 0\n",
+ "# psm_match.raw_data.save_hdf(\"/Users/wenfengzeng/data/orbi_dia/hla/20200317_QE_HFX2_LC3_DIA_RA957_R01.raw.hdf\")"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "from alphabase.psm_reader.dia_psm_reader import DiannReader\n",
+ "psm_df = DiannReader().import_file(\"/Users/wenfengzeng/data/orbi_dia/hla/20200317_QE_HFX2_LC3_DIA_RA957_R01.tsv\")\n",
+ "psm_df"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "ms_files = [\"/Users/wenfengzeng/data/orbi_dia/hla/20200317_QE_HFX2_LC3_DIA_RA957_R01.raw.hdf\"]\n",
+ "df,mz_df,inten_df,merr_df = psm_match.match_ms2_multi_raw(psm_df, ms_files)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " raw_name | \n",
+ " sequence | \n",
+ " charge | \n",
+ " rt | \n",
+ " rt_start | \n",
+ " rt_stop | \n",
+ " mobility | \n",
+ " proteins | \n",
+ " uniprot_ids | \n",
+ " genes | \n",
+ " ... | \n",
+ " fdr | \n",
+ " spec_idx | \n",
+ " mods | \n",
+ " mod_sites | \n",
+ " nAA | \n",
+ " rt_norm | \n",
+ " precursor_mz | \n",
+ " ccs | \n",
+ " frag_start_idx | \n",
+ " frag_stop_idx | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " FPTSSPTL | \n",
+ " 1 | \n",
+ " 77.6566 | \n",
+ " 77.4414 | \n",
+ " 77.8715 | \n",
+ " 0 | \n",
+ " | \n",
+ " Q15696;Q15695 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.001595 | \n",
+ " 55017 | \n",
+ " | \n",
+ " | \n",
+ " 8 | \n",
+ " 0.648554 | \n",
+ " 849.435260 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " ANSAFVER | \n",
+ " 1 | \n",
+ " 46.3195 | \n",
+ " 46.1352 | \n",
+ " 46.5349 | \n",
+ " 0 | \n",
+ " | \n",
+ " P08238;Q58FF7;P07900;P07900-2 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.000097 | \n",
+ " 32622 | \n",
+ " | \n",
+ " | \n",
+ " 8 | \n",
+ " 0.386840 | \n",
+ " 893.447557 | \n",
+ " 0.0 | \n",
+ " 7 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " ANVAEHQR | \n",
+ " 2 | \n",
+ " 23.8646 | \n",
+ " 23.7079 | \n",
+ " 24.0212 | \n",
+ " 0 | \n",
+ " | \n",
+ " Q12929;Q12929-2 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.001492 | \n",
+ " 16745 | \n",
+ " | \n",
+ " | \n",
+ " 8 | \n",
+ " 0.199307 | \n",
+ " 462.735940 | \n",
+ " 0.0 | \n",
+ " 14 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " APAAGEEG | \n",
+ " 1 | \n",
+ " 19.4263 | \n",
+ " 19.2696 | \n",
+ " 19.5829 | \n",
+ " 0 | \n",
+ " | \n",
+ " Q9BYE7-3;Q9BYE7 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.001219 | \n",
+ " 13631 | \n",
+ " | \n",
+ " | \n",
+ " 8 | \n",
+ " 0.162240 | \n",
+ " 701.310060 | \n",
+ " 0.0 | \n",
+ " 21 | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " HRIVSISL | \n",
+ " 2 | \n",
+ " 67.9021 | \n",
+ " 67.7177 | \n",
+ " 68.0252 | \n",
+ " 0 | \n",
+ " | \n",
+ " Q5VY09 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.000713 | \n",
+ " 48029 | \n",
+ " | \n",
+ " | \n",
+ " 8 | \n",
+ " 0.567089 | \n",
+ " 462.784902 | \n",
+ " 0.0 | \n",
+ " 28 | \n",
+ " 35 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 60421 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " TPSKQSNNKYAASSYLSLTPEQWK | \n",
+ " 3 | \n",
+ " 78.7641 | \n",
+ " 78.6414 | \n",
+ " 78.8562 | \n",
+ " 0 | \n",
+ " | \n",
+ " P0DOY2;P0CF74;P0DOY3 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.000097 | \n",
+ " 55810 | \n",
+ " | \n",
+ " | \n",
+ " 24 | \n",
+ " 0.657804 | \n",
+ " 910.124079 | \n",
+ " 0.0 | \n",
+ " 534642 | \n",
+ " 534665 | \n",
+ "
\n",
+ " \n",
+ " | 60422 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " VDDTQFVRFDSDAASPRTEPRAPW | \n",
+ " 3 | \n",
+ " 88.9499 | \n",
+ " 88.7348 | \n",
+ " 89.1031 | \n",
+ " 0 | \n",
+ " | \n",
+ " P30490;P30685;P30491;P10319;P18463 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.000516 | \n",
+ " 63092 | \n",
+ " | \n",
+ " | \n",
+ " 24 | \n",
+ " 0.742871 | \n",
+ " 921.775562 | \n",
+ " 0.0 | \n",
+ " 534665 | \n",
+ " 534688 | \n",
+ "
\n",
+ " \n",
+ " | 60423 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " VDDTQFVRFDSDAASPRGEPREPW | \n",
+ " 3 | \n",
+ " 86.2703 | \n",
+ " 86.1779 | \n",
+ " 86.4238 | \n",
+ " 0 | \n",
+ " | \n",
+ " P30504 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.000399 | \n",
+ " 61178 | \n",
+ " | \n",
+ " | \n",
+ " 24 | \n",
+ " 0.720492 | \n",
+ " 926.435317 | \n",
+ " 0.0 | \n",
+ " 534688 | \n",
+ " 534711 | \n",
+ "
\n",
+ " \n",
+ " | 60424 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " ASGSGAQVGGPISSGSSASSVTVTR | \n",
+ " 3 | \n",
+ " 48.6067 | \n",
+ " 48.4827 | \n",
+ " 48.6992 | \n",
+ " 0 | \n",
+ " | \n",
+ " P02545;P02545-3;P02545-5;P02545-4 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.000713 | \n",
+ " 34246 | \n",
+ " | \n",
+ " | \n",
+ " 25 | \n",
+ " 0.405942 | \n",
+ " 736.368132 | \n",
+ " 0.0 | \n",
+ " 534711 | \n",
+ " 534735 | \n",
+ "
\n",
+ " \n",
+ " | 60425 | \n",
+ " 20200317_QE_HFX2_LC3_DIA_RA957_R01 | \n",
+ " SGGGGGGSSGGRGSGGGSSGGSIGG | \n",
+ " 2 | \n",
+ " 18.2415 | \n",
+ " 18.1475 | \n",
+ " 18.3042 | \n",
+ " 0 | \n",
+ " | \n",
+ " P04264;CON__P04264 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.000097 | \n",
+ " 12800 | \n",
+ " | \n",
+ " | \n",
+ " 25 | \n",
+ " 0.152345 | \n",
+ " 905.388955 | \n",
+ " 0.0 | \n",
+ " 534735 | \n",
+ " 534759 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
60426 rows × 22 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " raw_name sequence charge \\\n",
+ "0 20200317_QE_HFX2_LC3_DIA_RA957_R01 FPTSSPTL 1 \n",
+ "1 20200317_QE_HFX2_LC3_DIA_RA957_R01 ANSAFVER 1 \n",
+ "2 20200317_QE_HFX2_LC3_DIA_RA957_R01 ANVAEHQR 2 \n",
+ "3 20200317_QE_HFX2_LC3_DIA_RA957_R01 APAAGEEG 1 \n",
+ "4 20200317_QE_HFX2_LC3_DIA_RA957_R01 HRIVSISL 2 \n",
+ "... ... ... ... \n",
+ "60421 20200317_QE_HFX2_LC3_DIA_RA957_R01 TPSKQSNNKYAASSYLSLTPEQWK 3 \n",
+ "60422 20200317_QE_HFX2_LC3_DIA_RA957_R01 VDDTQFVRFDSDAASPRTEPRAPW 3 \n",
+ "60423 20200317_QE_HFX2_LC3_DIA_RA957_R01 VDDTQFVRFDSDAASPRGEPREPW 3 \n",
+ "60424 20200317_QE_HFX2_LC3_DIA_RA957_R01 ASGSGAQVGGPISSGSSASSVTVTR 3 \n",
+ "60425 20200317_QE_HFX2_LC3_DIA_RA957_R01 SGGGGGGSSGGRGSGGGSSGGSIGG 2 \n",
+ "\n",
+ " rt rt_start rt_stop mobility proteins \\\n",
+ "0 77.6566 77.4414 77.8715 0 \n",
+ "1 46.3195 46.1352 46.5349 0 \n",
+ "2 23.8646 23.7079 24.0212 0 \n",
+ "3 19.4263 19.2696 19.5829 0 \n",
+ "4 67.9021 67.7177 68.0252 0 \n",
+ "... ... ... ... ... ... \n",
+ "60421 78.7641 78.6414 78.8562 0 \n",
+ "60422 88.9499 88.7348 89.1031 0 \n",
+ "60423 86.2703 86.1779 86.4238 0 \n",
+ "60424 48.6067 48.4827 48.6992 0 \n",
+ "60425 18.2415 18.1475 18.3042 0 \n",
+ "\n",
+ " uniprot_ids genes ... fdr spec_idx \\\n",
+ "0 Q15696;Q15695 ... 0.001595 55017 \n",
+ "1 P08238;Q58FF7;P07900;P07900-2 ... 0.000097 32622 \n",
+ "2 Q12929;Q12929-2 ... 0.001492 16745 \n",
+ "3 Q9BYE7-3;Q9BYE7 ... 0.001219 13631 \n",
+ "4 Q5VY09 ... 0.000713 48029 \n",
+ "... ... ... ... ... ... \n",
+ "60421 P0DOY2;P0CF74;P0DOY3 ... 0.000097 55810 \n",
+ "60422 P30490;P30685;P30491;P10319;P18463 ... 0.000516 63092 \n",
+ "60423 P30504 ... 0.000399 61178 \n",
+ "60424 P02545;P02545-3;P02545-5;P02545-4 ... 0.000713 34246 \n",
+ "60425 P04264;CON__P04264 ... 0.000097 12800 \n",
+ "\n",
+ " mods mod_sites nAA rt_norm precursor_mz ccs frag_start_idx \\\n",
+ "0 8 0.648554 849.435260 0.0 0 \n",
+ "1 8 0.386840 893.447557 0.0 7 \n",
+ "2 8 0.199307 462.735940 0.0 14 \n",
+ "3 8 0.162240 701.310060 0.0 21 \n",
+ "4 8 0.567089 462.784902 0.0 28 \n",
+ "... ... ... .. ... ... ... ... \n",
+ "60421 24 0.657804 910.124079 0.0 534642 \n",
+ "60422 24 0.742871 921.775562 0.0 534665 \n",
+ "60423 24 0.720492 926.435317 0.0 534688 \n",
+ "60424 25 0.405942 736.368132 0.0 534711 \n",
+ "60425 25 0.152345 905.388955 0.0 534735 \n",
+ "\n",
+ " frag_stop_idx \n",
+ "0 7 \n",
+ "1 14 \n",
+ "2 21 \n",
+ "3 28 \n",
+ "4 35 \n",
+ "... ... \n",
+ "60421 534665 \n",
+ "60422 534688 \n",
+ "60423 534711 \n",
+ "60424 534735 \n",
+ "60425 534759 \n",
+ "\n",
+ "[60426 rows x 22 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " b_z1 | \n",
+ " b_z2 | \n",
+ " y_z1 | \n",
+ " y_z2 | \n",
+ " b_modloss_z1 | \n",
+ " b_modloss_z2 | \n",
+ " y_modloss_z1 | \n",
+ " y_modloss_z2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 190300.578125 | \n",
+ " 0.0 | \n",
+ " 37765.183594 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 534754 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 534755 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 534756 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 534757 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 534758 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
534759 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n",
+ "0 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "1 190300.578125 0.0 37765.183594 0.0 0.0 0.0 \n",
+ "2 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "3 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "4 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... \n",
+ "534754 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "534755 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "534756 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "534757 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "534758 0.000000 0.0 0.000000 0.0 0.0 0.0 \n",
+ "\n",
+ " y_modloss_z1 y_modloss_z2 \n",
+ "0 0.0 0.0 \n",
+ "1 0.0 0.0 \n",
+ "2 0.0 0.0 \n",
+ "3 0.0 0.0 \n",
+ "4 0.0 0.0 \n",
+ "... ... ... \n",
+ "534754 0.0 0.0 \n",
+ "534755 0.0 0.0 \n",
+ "534756 0.0 0.0 \n",
+ "534757 0.0 0.0 \n",
+ "534758 0.0 0.0 \n",
+ "\n",
+ "[534759 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inten_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PCC | \n",
+ " COS | \n",
+ " SA | \n",
+ " SPC | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 20142.000000 | \n",
+ " 20142.000000 | \n",
+ " 20142.000000 | \n",
+ " 20142.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 0.793184 | \n",
+ " 0.801761 | \n",
+ " 0.662681 | \n",
+ " 0.786549 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 0.259575 | \n",
+ " 0.252218 | \n",
+ " 0.251722 | \n",
+ " 0.137454 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " -0.050093 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.026527 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 0.722388 | \n",
+ " 0.736639 | \n",
+ " 0.530222 | \n",
+ " 0.704215 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 0.900511 | \n",
+ " 0.905750 | \n",
+ " 0.724400 | \n",
+ " 0.805559 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 0.972494 | \n",
+ " 0.973826 | \n",
+ " 0.855859 | \n",
+ " 0.894600 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 1.000003 | \n",
+ " 1.000002 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " | >0.90 | \n",
+ " 0.510807 | \n",
+ " 0.522722 | \n",
+ " 0.167494 | \n",
+ " 0.260136 | \n",
+ "
\n",
+ " \n",
+ " | >0.75 | \n",
+ " 0.726856 | \n",
+ " 0.739632 | \n",
+ " 0.461060 | \n",
+ " 0.635836 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PCC COS SA SPC\n",
+ "count 20142.000000 20142.000000 20142.000000 20142.000000\n",
+ "mean 0.793184 0.801761 0.662681 0.786549\n",
+ "std 0.259575 0.252218 0.251722 0.137454\n",
+ "min -0.050093 0.000000 0.000000 0.026527\n",
+ "25% 0.722388 0.736639 0.530222 0.704215\n",
+ "50% 0.900511 0.905750 0.724400 0.805559\n",
+ "75% 0.972494 0.973826 0.855859 0.894600\n",
+ "max 1.000003 1.000002 1.000000 1.000000\n",
+ ">0.90 0.510807 0.522722 0.167494 0.260136\n",
+ ">0.75 0.726856 0.739632 0.461060 0.635836"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from peptdeep.model.ms2 import calc_ms2_similarity\n",
+ "metrics_list = []\n",
+ "frag_len = len(inten_df)//psm_match.max_spec_per_query\n",
+ "_df = df.iloc[:len(df)//psm_match.max_spec_per_query].copy()\n",
+ "for i in range(psm_match.max_spec_per_query):\n",
+ " for j in range(i+1,psm_match.max_spec_per_query):\n",
+ " _df,metrics_df = calc_ms2_similarity(\n",
+ " _df,\n",
+ " inten_df[i*frag_len:(i+1)*frag_len],\n",
+ " inten_df[j*frag_len:(j+1)*frag_len],\n",
+ " )\n",
+ " metrics_list.append(metrics_df)\n",
+ "sum(metrics_list)/len(metrics_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/peptdeep/mass_spec/match.py b/peptdeep/mass_spec/match.py
index 7e9f8041..eff9cd85 100644
--- a/peptdeep/mass_spec/match.py
+++ b/peptdeep/mass_spec/match.py
@@ -213,7 +213,10 @@ def match_one_raw_with_numba(
class PepSpecMatch(object):
- """Main entry for peptide-spectrum matching"""
+ """Main entry for peptide-spectrum matching.
+
+ TODO: figure out relation with `peptdeep.match.psm_match.PepSpecMatch`.
+ """
def __init__(
self,
diff --git a/peptdeep/match/__init__.py b/peptdeep/match/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/peptdeep/match/normal_dia.py b/peptdeep/match/normal_dia.py
new file mode 100644
index 00000000..cce2ebf0
--- /dev/null
+++ b/peptdeep/match/normal_dia.py
@@ -0,0 +1,50 @@
+"""DIA grouping for normal DIA data, which is not acquired with ion mobility separation. Moved from alpharaw."""
+
+import typing
+from collections import defaultdict
+
+import numpy as np
+
+from alpharaw.ms_data_base import MSData_Base
+
+
+class NormalDIAGrouper:
+ def __init__(self, ms_data: MSData_Base):
+ self.ms_data = ms_data
+ self.ms_data.spectrum_df["dia_group"] = ms_data.spectrum_df.precursor_mz.astype(
+ int
+ )
+
+ self.dia_group_dfs = self.ms_data.spectrum_df.groupby("dia_group")
+ self.dia_isolation_dict = {}
+ for dia_group, df in self.dia_group_dfs:
+ if dia_group == -1:
+ continue
+ self.dia_isolation_dict[dia_group] = (
+ df.isolation_lower_mz.values[0],
+ df.isolation_upper_mz.values[0],
+ )
+ self.dia_groups = np.sort(list(self.dia_isolation_dict.keys()))
+
+ def assign_dia_groups(
+ self, precursor_mzs
+ ) -> typing.DefaultDict[typing.List, typing.List]:
+ dia_precursor_groups = defaultdict(list)
+ for i, mz in enumerate(precursor_mzs):
+ i_group = np.searchsorted(self.dia_groups, int(mz))
+ if i_group == 0:
+ i_group = 1
+ elif i_group == len(self.dia_groups):
+ i_group -= 1
+ dia_group = self.dia_groups[i_group]
+ if dia_group in self.dia_isolation_dict:
+ isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group]
+ if mz >= isolation_lower and mz <= isolation_upper:
+ dia_precursor_groups[dia_group].append(i)
+ continue
+ dia_group = self.dia_groups[i_group - 1]
+ if dia_group in self.dia_isolation_dict:
+ isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group]
+ if mz >= isolation_lower and mz <= isolation_upper:
+ dia_precursor_groups[dia_group].append(i)
+ return dia_precursor_groups
diff --git a/peptdeep/match/psm_match.py b/peptdeep/match/psm_match.py
new file mode 100644
index 00000000..e34c1d09
--- /dev/null
+++ b/peptdeep/match/psm_match.py
@@ -0,0 +1,615 @@
+"""This module provides the `PepSpecMatch` class to match fragment ions of PSMs. Moved from alpharaw."""
+
+from typing import Tuple, Union
+
+import numba
+import numpy as np
+import pandas as pd
+import tqdm
+from alphabase.peptide.fragment import (
+ create_fragment_mz_dataframe,
+ get_charged_frag_types,
+)
+
+from peptdeep.match.utils import load_ms_data, match_one_raw_with_numba
+from peptdeep.match.normal_dia import NormalDIAGrouper
+from alpharaw import register_all_readers
+from alpharaw.match.match_utils import (
+ match_closest_peaks,
+ match_highest_peaks,
+)
+from alpharaw.match.spec_finder import find_dia_spec_idxes_same_window
+from alpharaw.ms_data_base import (
+ PEAK_INTENSITY_DTYPE,
+ PEAK_MZ_DTYPE,
+ MSData_Base,
+)
+from alpharaw.utils.ms_path_utils import parse_ms_files_to_dict
+
+register_all_readers() # TODO remove this import side effect
+
+
+class PepSpecMatch:
+ """
+ Extract fragment ions from MS2 data.
+ The extracted information can be used for visualization of peak annotation or
+ PeptDeep transfer learnining for the MS2 model.
+
+ TODO: figure out relation with `peptdeep.mass_spec.match.PepSpecMatch`.
+ """
+
+ match_closest: bool = True
+ use_ppm: bool = True
+ #: matching mass tolerance
+ tolerance: float = 20.0
+ ms_loader_thread_num: int = 4
+
+ def __init__(
+ self,
+ charged_frag_types: list = None,
+ match_closest: bool = True,
+ use_ppm: bool = True,
+ tol_value: float = 20.0,
+ ):
+ """
+ Parameters
+ ----------
+ charged_frag_types : list, optional
+ fragment types with charge states,
+ e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2'].
+ Defaults to `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`.
+
+ match_closest : bool, optional
+ if True, match the closest peak for a m/z;
+ if False, matched the higest peak for a m/z in the tolerance range.
+ By default True.
+
+ use_ppm : bool, optional
+ If use ppm other wise Da, by default True.
+
+ tol_value : float, optional
+ Matching tolerance value (ppm or Da based on `use_ppm`)
+ for peak annotation, by default 20.0
+ """
+ self.charged_frag_types = (
+ get_charged_frag_types(["b", "y", "b_modloss", "y_modloss"], 2)
+ if charged_frag_types is None
+ else charged_frag_types
+ )
+ self.match_closest = match_closest
+ self.use_ppm = use_ppm
+ self.tolerance = tol_value
+
+ def get_fragment_mz_df(self) -> pd.DataFrame:
+ """
+ Call :func:`alphabase.peptide.fragment.create_fragment_mz_dataframe`
+ for :attr:`PepSpecMatch.psm_df` and :attr:`PepSpecMatch.charged_frag_types`.
+
+
+ Returns
+ -------
+ DataFrame
+ The fragment m/z dataframe in alphabase format.
+ """
+ return create_fragment_mz_dataframe(
+ self.psm_df,
+ self.charged_frag_types,
+ dtype=PEAK_MZ_DTYPE,
+ )
+
+ def _add_missing_columns_to_psm_df(
+ self, psm_df: pd.DataFrame, raw_data: MSData_Base = None
+ ):
+ """
+ Add missing "rt", "nce", "rt_norm", ("mobility") columns to `psm_df` inplace if missing.
+
+ Parameters
+ ----------
+ psm_df : pd.DataFrame
+ psm dataframe to be processed.
+ raw_data : MSData_Base, optional
+ The `MSData_Base`. If None, `self.raw_data`. by default None.
+
+ Returns
+ -------
+ DataFrame
+ The original `psm_df` with missing columns added.
+ """
+ if raw_data is None:
+ raw_data = self.raw_data
+ add_spec_info_list = []
+ if "rt" not in psm_df.columns:
+ add_spec_info_list.append("rt")
+
+ if "nce" in raw_data.spectrum_df.columns and "nce" not in psm_df.columns:
+ add_spec_info_list.append("nce")
+
+ if (
+ "mobility" not in psm_df.columns
+ and "mobility" in raw_data.spectrum_df.columns
+ ):
+ add_spec_info_list.append("mobility")
+
+ if len(add_spec_info_list) > 0:
+ # pfind does not report RT in the result file
+ psm_df = (
+ psm_df.reset_index()
+ .merge(
+ raw_data.spectrum_df[["spec_idx"] + add_spec_info_list],
+ how="left",
+ on="spec_idx",
+ )
+ .set_index("index")
+ )
+
+ if "rt" in add_spec_info_list:
+ psm_df["rt_norm"] = psm_df.rt / raw_data.spectrum_df.rt.max()
+ # if 'rt_sec' not in psm_df.columns:
+ # psm_df['rt_sec'] = psm_df.rt*60
+ return psm_df
+
+ def _prepare_matching_dfs(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+ """
+ Prepare empty `fragment_mz_df`, `matched_intensity_df`,
+ and `matched_mz_err_df` dataframes to extract peak matching information
+ for `self.psm_df`. These three dataframes will be only used internally
+ in :class:`PepSpecMatch` objects.
+
+ Returns
+ -------
+ Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
+ pd.DataFrame: fragment mz dataframe.
+
+ pd.DataFrame: intensity dataframe to match.
+
+ pd.DataFrame: mz error dataframe to match.
+ """
+ fragment_mz_df = self.get_fragment_mz_df()
+
+ matched_intensity_df = pd.DataFrame(
+ np.zeros_like(fragment_mz_df.values, dtype=PEAK_INTENSITY_DTYPE),
+ columns=fragment_mz_df.columns,
+ )
+
+ matched_mz_err_df = pd.DataFrame(
+ np.full_like(fragment_mz_df.values, np.inf, dtype=PEAK_MZ_DTYPE),
+ columns=fragment_mz_df.columns,
+ )
+ return (fragment_mz_df, matched_intensity_df, matched_mz_err_df)
+
+ def load_ms_data(
+ self,
+ ms_file: Union[str, MSData_Base],
+ ms_file_type: str = "alpharaw_hdf",
+ process_count: int = 8,
+ **kwargs,
+ ):
+ """Load MS file to set `self.raw_data`.
+
+ Parameters
+ ----------
+ ms_file : str | MSData_Base
+ Absolute or relative path of the ms2 file.
+
+ ms_file_type : str, optional
+ ms2 file type, could be
+ ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"].
+ Default to 'alpharaw_hdf'.
+ """
+ self.raw_data = load_ms_data(ms_file, ms_file_type, process_count=process_count)
+
+ def get_peaks(self, spec_idx: int, **kwargs):
+ return self.raw_data.get_peaks(spec_idx)
+
+ def _match_one_psm(
+ self,
+ peak_mzs: np.ndarray,
+ peak_intensities: np.ndarray,
+ fragment_mz_df: pd.DataFrame,
+ matched_intensity_df: pd.DataFrame,
+ matched_mz_err_df: pd.DataFrame,
+ frag_start_idx: int,
+ frag_stop_idx: int,
+ ):
+ """
+ Match fragments of one precursor (located by `frag_start_idx` and `frag_stop_idx`)
+ against the corresponding `peak_mzs`.
+
+ Parameters
+ ----------
+ peak_mzs : np.ndarray
+ Peak m/z values to be matched.
+ peak_intensities : np.ndarray
+ Peak intensities to be matched.
+ fragment_mz_df : pd.DataFrame
+ fragment m/z dataframe to be matched.
+ matched_intensity_df : pd.DataFrame
+ The dataframe to store matched intensity values.
+ matched_mz_err_df : pd.DataFrame
+ The dataframe to store matched mz error values.
+ frag_start_idx : int
+ fragment start index of the given PSM.
+ frag_stop_idx : int
+ fragment stop index of the given PSM.
+ """
+ if len(peak_mzs) == 0:
+ return
+
+ peak_mzs = peak_mzs.astype(PEAK_MZ_DTYPE)
+
+ frag_mzs = fragment_mz_df.values[frag_start_idx:frag_stop_idx, :]
+
+ if self.use_ppm:
+ mz_tols = frag_mzs * self.tolerance * 1e-6
+ else:
+ mz_tols = np.full_like(frag_mzs, self.tolerance)
+
+ if self.match_closest:
+ matched_idxes = match_closest_peaks(
+ peak_mzs, peak_intensities, frag_mzs, mz_tols
+ )
+ else:
+ matched_idxes = match_highest_peaks(
+ peak_mzs,
+ peak_intensities,
+ frag_mzs,
+ mz_tols,
+ )
+
+ matched_intens = peak_intensities[matched_idxes]
+ matched_intens[matched_idxes == -1] = 0
+
+ matched_mz_errs = np.abs(peak_mzs[matched_idxes] - frag_mzs)
+ matched_mz_errs[matched_idxes == -1] = np.inf
+
+ matched_intensity_df.values[frag_start_idx:frag_stop_idx, :] = matched_intens
+
+ matched_mz_err_df.values[frag_start_idx:frag_stop_idx, :] = matched_mz_errs
+
+ def match_ms2_one_raw(
+ self,
+ psm_df_one_raw: pd.DataFrame,
+ verbose: bool = False,
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+ """
+ Matching psm_df_one_raw against self.raw_data
+ after `self.load_ms_data()`
+
+ Parameters
+ ----------
+ psm_df_one_raw : pd.DataFrame
+ psm dataframe
+ that contains only one raw file
+
+ Returns
+ -------
+ Tuple:
+ pd.DataFrame: psm dataframe with fragment index information.
+
+ pd.DataFrame: fragment mz dataframe.
+
+ pd.DataFrame: matched intensity dataframe.
+
+ pd.DataFrame: matched mass error dataframe.
+ np.inf if a fragment is not matched.
+ """
+ self.psm_df = psm_df_one_raw
+
+ psm_df_one_raw = self._add_missing_columns_to_psm_df(
+ psm_df_one_raw, self.raw_data
+ )
+
+ (
+ fragment_mz_df,
+ matched_intensity_df,
+ matched_mz_err_df,
+ ) = self._prepare_matching_dfs()
+
+ psm_iters = psm_df_one_raw[
+ ["spec_idx", "frag_start_idx", "frag_stop_idx"]
+ ].values
+ if verbose:
+ psm_iters = tqdm.tqdm(psm_iters)
+
+ for spec_idx, frag_start_idx, frag_stop_idx in psm_iters:
+ (spec_mzs, spec_intens) = self.get_peaks(spec_idx)
+
+ self._match_one_psm(
+ spec_mzs,
+ spec_intens,
+ fragment_mz_df,
+ matched_intensity_df,
+ matched_mz_err_df,
+ frag_start_idx,
+ frag_stop_idx,
+ )
+
+ return (psm_df_one_raw, fragment_mz_df, matched_intensity_df, matched_mz_err_df)
+
+ def _match_ms2_one_raw_numba(self, raw_name: str, psm_df_one_raw: pd.DataFrame):
+ if raw_name in self._ms_file_dict:
+ raw_data = load_ms_data(
+ self._ms_file_dict[raw_name],
+ self._ms_file_type,
+ process_count=self.ms_loader_thread_num,
+ )
+ if raw_data is None:
+ return
+
+ psm_df_one_raw = self._add_missing_columns_to_psm_df(
+ psm_df_one_raw, raw_data
+ )
+
+ if self.use_ppm:
+ all_frag_mz_tols = self.fragment_mz_df.values * self.tolerance * 1e-6
+ else:
+ all_frag_mz_tols = np.full_like(
+ self.fragment_mz_df.values, self.tolerance
+ )
+
+ match_one_raw_with_numba(
+ psm_df_one_raw.spec_idx.values,
+ psm_df_one_raw.frag_start_idx.values,
+ psm_df_one_raw.frag_stop_idx.values,
+ self.fragment_mz_df.values,
+ all_frag_mz_tols,
+ raw_data.peak_df.mz.values,
+ raw_data.peak_df.intensity.values,
+ raw_data.spectrum_df.peak_start_idx.values,
+ raw_data.spectrum_df.peak_stop_idx.values,
+ self.matched_intensity_df.values,
+ self.matched_mz_err_df.values,
+ self.match_closest,
+ )
+ else:
+ print(f"`{raw_name}` is not found in ms_file_dict.")
+ return
+ return psm_df_one_raw
+
+ def match_ms2_multi_raw(
+ self,
+ psm_df: pd.DataFrame,
+ ms_files: Union[dict, list],
+ ms_file_type: str = "alpharaw_hdf",
+ process_num: int = 1,
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+ """
+ Matching PSM dataframe against the ms2 files in ms_files
+ This method will store matched values as attributes:
+ - self.psm_df
+ - self.fragment_mz_df
+ - self.matched_intensity_df
+ - self.matched_mz_err_df
+
+ Parameters
+ ----------
+ psm_df : pd.DataFrame
+ PSM dataframe
+
+ ms_files : dict | list
+ if dict: {raw_name: ms2 path}
+ if list: [ms2 path1, ms2 path2]
+
+ ms_file_type : str, optional
+ Could be 'alpharaw_hdf', 'mgf' or 'thermo', 'sciex', 'alphapept_hdf'.
+ Defaults to 'alphapept'.
+
+ Returns
+ -------
+ Tuple:
+ pd.DataFrame: psm dataframe with fragment index information.
+
+ pd.DataFrame: fragment mz dataframe.
+
+ pd.DataFrame: matched intensity dataframe.
+
+ pd.DataFrame: matched mass error dataframe.
+ np.inf if a fragment is not matched.
+
+ """
+ self.psm_df = psm_df
+
+ (
+ self.fragment_mz_df,
+ self.matched_intensity_df,
+ self.matched_mz_err_df,
+ ) = self._prepare_matching_dfs()
+
+ if isinstance(ms_files, dict):
+ self._ms_file_dict = ms_files
+ else:
+ self._ms_file_dict = parse_ms_files_to_dict(ms_files)
+
+ self._ms_file_type = ms_file_type
+
+ self.ms_loader_thread_num = process_num
+ # if process_num <= 1 or len(self._ms_file_dict) <= 1:
+ psm_df_list = []
+ for raw_name, df_group in tqdm.tqdm(self.psm_df.groupby("raw_name")):
+ _df = self._match_ms2_one_raw_numba(raw_name, df_group)
+ if _df is not None:
+ psm_df_list.append(_df)
+ # else:
+ # with mp.get_context("spawn").Pool(processes=process_num) as p:
+ # df_groupby = self.psm_df.groupby('raw_name')
+ # def gen_group_df(df_groupby):
+ # for raw_name, df_group in df_groupby:
+ # yield (raw_name, df_group)
+ # process_bar(
+ # p.imap_unordered(
+ # self._match_ms2_one_raw_numba,
+ # gen_group_df(df_groupby)
+ # ), df_groupby.ngroups
+ # )
+
+ self.psm_df = pd.concat(psm_df_list, ignore_index=True)
+ return (
+ self.psm_df,
+ self.fragment_mz_df,
+ self.matched_intensity_df,
+ self.matched_mz_err_df,
+ )
+
+
+class PepSpecMatch_DIA(PepSpecMatch):
+ """
+ Peak annotation for DIA data.
+ """
+
+ max_spec_per_query: int = 3
+ min_frag_mz: float = 200.0
+
+ def _add_missing_columns_to_psm_df(self, psm_df: pd.DataFrame, raw_data=None):
+ # DIA results do not have spec_idx/scan_num in psm_df, nothing to merge
+ return psm_df
+
+ def _prepare_matching_dfs(self):
+ fragment_mz_df = self.get_fragment_mz_df()
+ fragment_mz_df = pd.concat(
+ [fragment_mz_df] * self.max_spec_per_query, ignore_index=True
+ )
+ if self.use_ppm:
+ self.all_frag_mz_tols = fragment_mz_df.values * self.tolerance * 1e-6
+ else:
+ self.all_frag_mz_tols = np.full_like(fragment_mz_df.values, self.tolerance)
+
+ psm_df_list = []
+ len_frags = len(fragment_mz_df) // self.max_spec_per_query
+ for i in range(self.max_spec_per_query):
+ psm_df = self.psm_df.copy()
+ psm_df["frag_start_idx"] = psm_df.frag_start_idx + i * len_frags
+ psm_df["frag_stop_idx"] = psm_df.frag_stop_idx + i * len_frags
+ psm_df_list.append(psm_df)
+ self.psm_df = pd.concat(psm_df_list, ignore_index=True)
+
+ matched_intensity_df = pd.DataFrame(
+ np.zeros_like(fragment_mz_df.values, dtype=PEAK_INTENSITY_DTYPE),
+ columns=fragment_mz_df.columns,
+ )
+
+ matched_mz_err_df = pd.DataFrame(
+ np.zeros_like(fragment_mz_df.values, dtype=PEAK_MZ_DTYPE),
+ columns=fragment_mz_df.columns,
+ )
+
+ return (fragment_mz_df, matched_intensity_df, matched_mz_err_df)
+
+ def _match_ms2_one_raw_numba(
+ self, raw_name: str, psm_df_one_raw: pd.DataFrame
+ ) -> pd.DataFrame:
+ """
+ Internal method to extract peak information with numba as backend.
+
+ Parameters
+ ----------
+ raw_name : str
+ The raw name of the raw file. `psm_df_one_raw` dataframe should also
+ contain the same raw name in `raw_name` column.
+ psm_df_one_raw : pd.DataFrame
+ The dataframe for PSMs.
+
+ Returns
+ -------
+ pd.DataFrame
+ `psm_df_one_raw`
+ """
+ psm_df_one_raw = psm_df_one_raw.reset_index(drop=True)
+
+ if raw_name in self._ms_file_dict:
+ raw_data = load_ms_data(
+ self._ms_file_dict[raw_name],
+ self._ms_file_type,
+ process_count=self.ms_loader_thread_num,
+ )
+ if raw_data is None:
+ return
+
+ psm_origin_len = len(psm_df_one_raw) // self.max_spec_per_query
+
+ grouper = NormalDIAGrouper(raw_data)
+
+ psm_groups = grouper.assign_dia_groups(
+ psm_df_one_raw.precursor_mz.values[:psm_origin_len]
+ )
+
+ all_spec_idxes = np.full(len(psm_df_one_raw), -1, dtype=np.int32)
+
+ for dia_group, group_df in grouper.dia_group_dfs:
+ psm_idxes = psm_groups[dia_group]
+ if len(psm_idxes) == 0:
+ continue
+ psm_idxes = np.array(psm_idxes, dtype=np.int32)
+ spec_idxes = find_dia_spec_idxes_same_window(
+ group_df.rt.values,
+ psm_df_one_raw.rt.values[psm_idxes],
+ max_spec_per_query=self.max_spec_per_query,
+ )
+ for i in range(spec_idxes.shape[-1]):
+ all_spec_idxes[psm_idxes + psm_origin_len * i] = spec_idxes[:, i]
+
+ match_one_raw_with_numba(
+ all_spec_idxes,
+ psm_df_one_raw.frag_start_idx.values,
+ psm_df_one_raw.frag_stop_idx.values,
+ self.fragment_mz_df.values,
+ self.all_frag_mz_tols,
+ raw_data.peak_df.mz.values,
+ raw_data.peak_df.intensity.values,
+ group_df.peak_start_idx.values,
+ group_df.peak_stop_idx.values,
+ self.matched_intensity_df.values,
+ self.matched_mz_err_df.values,
+ self.match_closest,
+ )
+ else:
+ print(f"`{raw_name}` is not found in ms_file_dict.")
+ return
+ return psm_df_one_raw
+
+ def match_ms2_multi_raw(
+ self,
+ psm_df: pd.DataFrame,
+ ms_files: Tuple[dict, list],
+ ms_file_type: str = "alpharaw_hdf",
+ process_num: int = 8,
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+ """
+ Match peaks for the given `psm_df` against the corresponding MS spectrum files.
+
+ Parameters
+ ----------
+ psm_df : pd.DataFrame
+ Peptide-spectrum matches in alphabase dataframe format.
+ ms_files : Tuple[dict, list]
+ The absolute or relative paths of MS files.
+ if the type is `dict`, the format will be
+ `{'raw_name1': 'raw_name1.raw', ...}` if `ms_file_type` is `thermo_raw`.
+ ms_file_type : str, optional
+ MS file type that is already registered in
+ :obj:`alpharaw.ms_data_base.ms_reader_provider`.
+ By default "alpharaw_hdf".
+ process_num : int, optional
+ Match peaks by using multiprocessing, by default 8
+
+ Returns
+ -------
+ Tuple
+ pd.DataFrame: the `psm_df`.
+
+ pd.DataFrame: fragment m/z dataframe in alphabase format.
+
+ pd.DataFrame: the matched fragment intensity dataframe in alphabase format.
+
+ pd.DataFrame: the matched mass error in the same dataframe format.
+ """
+ if isinstance(ms_files, list):
+ ms_files = parse_ms_files_to_dict(ms_files)
+ psm_df = psm_df[psm_df.raw_name.isin(ms_files)].reset_index(drop=True)
+ super().match_ms2_multi_raw(psm_df, ms_files, ms_file_type, process_num)
+
+ return (
+ self.psm_df,
+ self.fragment_mz_df,
+ self.matched_intensity_df,
+ self.matched_mz_err_df,
+ )
diff --git a/peptdeep/match/utils.py b/peptdeep/match/utils.py
new file mode 100644
index 00000000..4ed29727
--- /dev/null
+++ b/peptdeep/match/utils.py
@@ -0,0 +1,138 @@
+"""Utilities for psm matching. Moved from alpharaw."""
+
+from typing import Union
+
+import numba
+import numpy as np
+
+from alpharaw.match.match_utils import match_closest_peaks, match_highest_peaks
+from alpharaw.ms_data_base import MSData_Base, ms_reader_provider
+
+
+@numba.jit(nogil=True)
+def match_one_raw_with_numba(
+ spec_idxes: np.ndarray,
+ frag_start_idxes: np.ndarray,
+ frag_stop_idxes: np.ndarray,
+ all_frag_mzs: np.ndarray,
+ all_frag_mz_tols: np.ndarray,
+ all_spec_mzs: np.ndarray,
+ all_spec_intensities: np.ndarray,
+ peak_start_idxes: np.ndarray,
+ peak_stop_idxes: np.ndarray,
+ matched_intensities: np.ndarray,
+ matched_mz_errs: np.ndarray,
+ match_closest: bool = True,
+) -> None:
+ """
+ Internel function to match fragment mz values to spectrum mz values.
+ Matched_mz_errs[i] = np.inf if no peaks are matched.
+
+ Results will saved in place of matched_intensities
+ and matched_mz_errs.
+ """
+ for spec_idx, frag_start, frag_stop in zip(
+ spec_idxes, frag_start_idxes, frag_stop_idxes
+ ):
+ if spec_idx == -1:
+ continue
+ peak_start = peak_start_idxes[spec_idx]
+ peak_stop = peak_stop_idxes[spec_idx]
+ if peak_stop == peak_start:
+ continue
+ spec_mzs = all_spec_mzs[peak_start:peak_stop]
+ spec_intens = all_spec_intensities[peak_start:peak_stop]
+
+ frag_mzs = all_frag_mzs[frag_start:frag_stop, :].copy()
+ frag_mz_tols = all_frag_mz_tols[frag_start:frag_stop, :].copy()
+
+ if match_closest:
+ matched_idxes = match_closest_peaks(
+ spec_mzs, spec_intens, frag_mzs, frag_mz_tols
+ ).reshape(-1)
+ else:
+ matched_idxes = match_highest_peaks(
+ spec_mzs, spec_intens, frag_mzs, frag_mz_tols
+ ).reshape(-1)
+
+ matched_intens = spec_intens[matched_idxes]
+ matched_intens[matched_idxes == -1] = 0
+
+ matched_mass_errs = np.abs(
+ spec_mzs[matched_idxes.reshape(-1)] - frag_mzs.reshape(-1)
+ )
+ matched_mass_errs[matched_idxes == -1] = np.inf
+
+ matched_intensities[frag_start:frag_stop, :] = matched_intens.reshape(
+ frag_mzs.shape
+ )
+
+ matched_mz_errs[frag_start:frag_stop, :] = matched_mass_errs.reshape(
+ frag_mzs.shape
+ )
+
+
+def load_ms_data(
+ ms_file: Union[str, MSData_Base],
+ ms_file_type: str = "alpharaw_hdf",
+ process_count: int = 8,
+) -> MSData_Base:
+ """
+ Load MS file and get `MSData_Base` object.
+
+ Parameters
+ ----------
+ ms_file : str | MSData_Base
+ ms2 file path
+
+ ms_file_type : str, optional
+ ms2 file type, can be
+ ["alpharaw_hdf", "thermo", "sciex", "alphapept_hdf", "mgf"].
+ Default to 'alpharaw_hdf'.
+
+ Returns
+ -------
+ MSData_Base:
+ Instance of sub-class of `MSData_Base`.
+ """
+ if isinstance(ms_file, MSData_Base):
+ return ms_file
+ else:
+ raw_data = ms_reader_provider.get_reader(
+ ms_file_type, process_count=process_count
+ )
+ if raw_data is None:
+ print(
+ f"Raw file type '{ms_file_type}' is not registered in 'ms_reader_provider'"
+ )
+ return
+ raw_data.import_raw(ms_file)
+ return raw_data
+
+
+@numba.njit
+def get_ion_count_scores(
+ frag_mz_values: np.ndarray,
+ matched_intens: np.ndarray,
+ frag_start_idxes: np.ndarray,
+ frag_stop_idxes: np.ndarray,
+ min_mz: float = 200,
+):
+ """
+ TODO Deprecated
+ """
+ scores = []
+ for i in range(len(frag_start_idxes)):
+ scores.append(
+ np.count_nonzero(
+ matched_intens[frag_start_idxes[i] : frag_stop_idxes[i], :]
+ .copy()
+ .reshape(-1)[
+ frag_mz_values[frag_start_idxes[i] : frag_stop_idxes[i], :]
+ .copy()
+ .reshape(-1)
+ >= min_mz
+ ]
+ )
+ )
+ return np.array(scores, np.int32)
diff --git a/peptdeep/pipeline_api.py b/peptdeep/pipeline_api.py
index 03359d8e..8812d4e2 100644
--- a/peptdeep/pipeline_api.py
+++ b/peptdeep/pipeline_api.py
@@ -23,17 +23,14 @@
from peptdeep.pretrained_models import ModelManager
-# from peptdeep.rescore.feature_extractor import match_one_raw
-from alpharaw.match.psm_match import (
- PepSpecMatch,
- PepSpecMatch_DIA,
- parse_ms_files_to_dict,
- get_ion_count_scores,
-)
from peptdeep.model.ms2 import calc_ms2_similarity
import peptdeep.model.rt as rt_module
+from peptdeep.match.psm_match import PepSpecMatch_DIA, PepSpecMatch
+from peptdeep.match.utils import get_ion_count_scores
+from alpharaw.utils.ms_path_utils import parse_ms_files_to_dict
+
DIA_max_spec_per_query = 3
DIA_min_ion_count = 6
DIA_min_frag_mz = 200.0
@@ -108,10 +105,10 @@ def get_median_pccs_for_dia_psms(
-----
The `psm_df` contains `max_spec_per_query` copies per peptide, each matched against
a different MS2 scan. The PSMs need to be sorted by spec index before processing because
- `alpharaw.match.psm_match.PepSpecMatch.match_ms2_multi_raw`
+ `peptdeep.match.psm_match.PepSpecMatch.match_ms2_multi_raw`
orders them by `raw_name` when multiple MS files are used.
- See also (alpharaw.match.psm_match):
+ See also (peptdeep.match.psm_match):
- `PepSpecMatch_DIA._prepare_matching_dfs`: creates replicated PSM/fragment structure
- `PepSpecMatch_DIA._match_ms2_one_raw_numba`: finds nearby MS2 scans and extracts fragments
- `PepSpecMatch.match_ms2_multi_raw`: processes multiple MS files (reorders by raw_name)
diff --git a/peptdeep/utils/__init__.py b/peptdeep/utils/__init__.py
index 3e98499e..14a7e50c 100644
--- a/peptdeep/utils/__init__.py
+++ b/peptdeep/utils/__init__.py
@@ -12,7 +12,6 @@
import numpy as np
-# from alphatims
def process_bar(iterator, len_iter):
with tqdm.tqdm(total=len_iter) as bar:
i = 0
diff --git a/peptdeep/utils/logger.py b/peptdeep/utils/logger.py
index 3e90ee19..91cf48cc 100644
--- a/peptdeep/utils/logger.py
+++ b/peptdeep/utils/logger.py
@@ -38,7 +38,7 @@ def set_logger(
The file name to where the log is written.
Folders are automatically created if needed.
This is relative to the current path. When an empty string is provided,
- a log is written to the AlphaTims "logs" folder with the name
+ a log is written to the PeptDeep "logs" folder with the name
"log_yymmddhhmmss" (reversed timestamp year to seconds).
If None, no log file is saved.
Default is "".
@@ -158,7 +158,7 @@ def show_python_info() -> None:
This is done in the following format:
```
- [timestamp]> Python information:
- - [timestamp]> alphatims - [current_version]
+ - [timestamp]> peptdeep - [current_version]
- [timestamp]> [required package] - [current_version]
- ...
- [timestamp]> [required package] - [current_version]
diff --git a/tests/run_tests.sh b/tests/run_tests.sh
index 6efaf9e7..14afcb6d 100755
--- a/tests/run_tests.sh
+++ b/tests/run_tests.sh
@@ -1,4 +1,6 @@
-TEST_NBS=$(find ../nbs_tests -name "*.ipynb")
+# ignore notebooks that requires local data
+TEST_NBS=$(find ../nbs_tests -name "*.ipynb" | grep -v "test_dia_matching.ipynb")
+
TUTORIAL_NBS=$(find ../docs/tutorials -name "*.ipynb")
UNIT_TESTS=$(find ../tests/unit -name "test_*.py")
ALL_TESTS=$(echo $TEST_NBS$'\n'$TUTORIAL_NBS$'\n'$UNIT_TESTS)