From 4d4caf4f443467246668b7f6469345eb2b129a8f Mon Sep 17 00:00:00 2001 From: Ralf Weber Date: Sun, 5 Jul 2020 00:12:32 +0100 Subject: [PATCH] Improve performance sqlite queries and enrich ranking summary (#16) * Optimise queries and add additional indexes * Enrich ranking outputs * Update CLI * Fix tests --- msnpy/__main__.py | 13 +- msnpy/annotation.py | 226 +++++++++++++------- tests/data/ranks_21-hydroxyprogesterone.txt | 14 +- tests/test_annotation.py | 2 +- 4 files changed, 171 insertions(+), 84 deletions(-) diff --git a/msnpy/__main__.py b/msnpy/__main__.py index 4cb7e94..f37dee7 100644 --- a/msnpy/__main__.py +++ b/msnpy/__main__.py @@ -224,11 +224,16 @@ def main(): # pragma: no cover parser_ast.add_argument('-f', '--filter', action='store_true', required=False, - help="Filter the spectral tree annotations.") + help="Filter the spectral tree annotations using fragmentation consistency rules.") + + parser_ast.add_argument('-n', '--remove-nodes', + action='store_true', required=False, + help="Remove fragments from that have not been annotated a molecular formula (after filtering).") parser_ast.add_argument('-t', '--time-limit', - default=None, type=int, required=False, - help="Time limit (seconds) for each tree to be processed for annotation") + default=0, type=int, required=False, + help="Time limit (seconds) for each tree to be processed for annotation." + "Set to 0 to not apply a time limit (default).") ################################# # RANK SPECTRAL TREES @@ -383,7 +388,7 @@ def main(): # pragma: no cover time_limit=args.time_limit) if args.filter: - st = filter_mf(st, args.output_db, args.time_limit) + st = filter_mf(st, args.output_db, args.remove_nodes, args.time_limit) save_trees(st, args.output_trees, format="json") if args.step == "rank-spectral-trees": diff --git a/msnpy/annotation.py b/msnpy/annotation.py index c31d168..d13a48e 100644 --- a/msnpy/annotation.py +++ b/msnpy/annotation.py @@ -19,15 +19,19 @@ # along with MSnPy. If not, see . 
# import signal +import copy import collections +import re import sqlite3 import time from typing import Sequence +import numpy as np import networkx as nx import pandas as pd import requests from .processing import mz_tol, mz_pair_diff_tol + def signal_handler(signum, frame): raise TimeoutException() @@ -36,6 +40,7 @@ class TimeoutException(Exception): def __init__(self, *args, **kwargs): pass + class ApiMfdb: def __init__(self, url="https://mfdb.bham.ac.uk"): @@ -92,9 +97,10 @@ def select_mf(self, min_tol: float, max_tol: float, adducts: dict = None, rules: def annotate_mf(spectral_trees: Sequence[nx.classes.ordered.OrderedDiGraph], db_out: str, ppm: float, adducts: dict = {"[M+H]+": 1.0072764}, rules: bool = True, mf_db: str = "http://mfdb.bham.ac.uk", - prefix_inp: str = "", time_limit: int = ''): + prefix_inp: str = "", time_limit: int = 0): for G in spectral_trees: + signal.signal(signal.SIGALRM, signal_handler) if time_limit: signal.alarm(int(time_limit)) # Time Limit @@ -111,6 +117,7 @@ def annotate_mf(spectral_trees: Sequence[nx.classes.ordered.OrderedDiGraph], db_ def annotate_mf_single(G, db_out, ppm: float, adducts: dict = {"[M+H]+": 1.0072764}, rules: bool = True, mf_db: str = "http://mfdb.bham.ac.uk", prefix_inp: str = ""): + conn = sqlite3.connect(db_out) cursor = conn.cursor() @@ -190,7 +197,7 @@ def annotate_mf_single(G, db_out, ppm: float, adducts: dict = {"[M+H]+": 1.00727 if len(records_mf) > 0: for mf in records_mf: - ppm_error = round((mf["mass"] - edge[2]['mzdiff']) / (mf["mass"] * 0.000001), 2) + ppm_error = round((edge[2]['mzdiff'] - mf["mass"]) / (mf["mass"] * 0.000001), 2) values = ("{}__{}".format(edge[0], edge[1]), mf_id, mf["atoms"]["C"], mf["atoms"]["H"], mf["atoms"]["N"], mf["atoms"]["O"], @@ -207,12 +214,6 @@ def annotate_mf_single(G, db_out, ppm: float, adducts: dict = {"[M+H]+": 1.00727 edge[2]['mzdiff'], None, "{}_{}".format(node_i["mslevel"], node_j["mslevel"]), None, None, None, None, None, 0,) rows.append(values) - #################################### - # else: - # values = ("{}__{}".format(edge[0], edge[1]), None, None, None, None, None, None, None, None, None, None, - # edge[2]['mzdiff'], None, "{}_{}".format(node_i["mslevel"], node_j["mslevel"]), - # None, None, None, None, None, 0,) - # rows.append(values) cursor.executemany(""" INSERT INTO MF{} @@ -220,6 +221,12 @@ def annotate_mf_single(G, db_out, ppm: float, adducts: dict = {"[M+H]+": 1.00727 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
""".format(prefix), rows) + cursor.execute(""" + CREATE INDEX MF_ID_{}_IDX + ON MF{} (MF_ID);""".format(prefix, prefix)) + cursor.execute(""" + CREATE INDEX MF_ID_F_{}_IDX + ON MF{} (MF_ID, FLAG);""".format(prefix, prefix)) cursor.execute(""" CREATE INDEX CHNOPS_ADDUCT_{}_IDX ON MF{} (MZ_ID, PRECURSOR, ADDUCT, C, H, N, O, P, S, FLAG);""".format(prefix, prefix)) @@ -227,8 +234,8 @@ def annotate_mf_single(G, db_out, ppm: float, adducts: dict = {"[M+H]+": 1.00727 CREATE INDEX MZ_ID_{}_IDX ON MF{} (MZ_ID);""".format(prefix, prefix)) cursor.execute(""" - CREATE INDEX MF_MSL_P_F_{}_IDX - ON MF{} (MSLEVEL, PRECURSOR, FLAG)""".format(prefix, prefix)) + CREATE INDEX MF_F_P_MSL_{}_IDX + ON MF{} (FLAG, PRECURSOR, MSLEVEL)""".format(prefix, prefix)) cursor.execute(""" CREATE INDEX MF_FLAG_{}_IDX ON MF{} (FLAG)""".format(prefix, prefix)) @@ -236,7 +243,7 @@ def annotate_mf_single(G, db_out, ppm: float, adducts: dict = {"[M+H]+": 1.00727 conn.close() -def mf_tree(G: nx.classes.ordered.OrderedDiGraph, path_db: str, max_mslevel: int, prefix: str): +def mf_tree(G: nx.classes.ordered.OrderedDiGraph, path_db: str, max_mslevel: int, remove: bool = True, prefix: str = ""): conn = sqlite3.connect(path_db) cursor = conn.cursor() @@ -272,6 +279,7 @@ def mf_tree(G: nx.classes.ordered.OrderedDiGraph, path_db: str, max_mslevel: int records = cursor.fetchall() GG = G.copy() + atoms = ["C", "H", "N", "O", "P", "S"] for record in records: @@ -302,56 +310,65 @@ def mf_tree(G: nx.classes.ordered.OrderedDiGraph, path_db: str, max_mslevel: int for mf in cursor.fetchall(): n = mf[0].split("__") if "mf" not in GG[n[0]][n[1]]: - GG[n[0]][n[1]]["mf"] = {} + GG[n[0]][n[1]]["mf"] = {} if mf[1] not in GG[n[0]][n[1]]["mf"]: GG[n[0]][n[1]]["mf"][str(mf[1])] = {"mass": float(mf[9]), "mf": print_formula(collections.OrderedDict(zip(atoms, mf[2:8])))} #print "N:", GG.number_of_nodes(), "E:", GG.number_of_edges() - nodes = GG.copy().nodes(data=True) - for node in nodes: - if "mf" not in node[1]: - GG.remove_node(node[0]) - GG.graph["id"] = "{}_{}".format(GG.graph["id"], mf_prec[0]) + if remove: + nodes = copy.deepcopy(GG.nodes(data=True)) + for node in nodes: + if "mf" not in node[1]: + GG.remove_node(node[0]) + + GG.graph["id"] = "{}_{}".format(GG.graph["id"], mf_prec[0]) mft.append(GG) + if not mfs_precs: + mft.append(G) + conn.close() return mft -def filter_mf(trees: Sequence[nx.classes.ordered.OrderedDiGraph], path_db: str, time_limit: int = ''): - - #http://www.sqlstyle.guide/ +def filter_mf(trees: Sequence[nx.classes.ordered.OrderedDiGraph], path_db: str, remove: bool = True, time_limit: int = 0): - print('filter mf') + # http://www.sqlstyle.guide/ annotated_trees = [] for G in trees: + signal.signal(signal.SIGALRM, signal_handler) if time_limit: signal.alarm(time_limit) - try: - filtered_tree = filter_mf_single_tree(G, path_db) + filtered_tree = filter_mf_single_tree(G, path_db, remove) if filtered_tree: annotated_trees.extend(filtered_tree) except TimeoutException as e: - print("Time out ", e) + annotated_trees.extend([G]) + print("Time out for spectral tree: {}".format(G.graph["id"]), e) if time_limit: signal.alarm(0) return annotated_trees -def filter_mf_single_tree(G, path_db): + +# @profile +def filter_mf_single_tree(G, path_db, remove): conn = sqlite3.connect(path_db) cursor = conn.cursor() + + cursor.execute("PRAGMA synchronous = OFF") + cursor.execute("PRAGMA journal_mode = OFF") + atoms = ["C", "H", "N", "O", "P", "S"] start_time = time.time() prefix = "_{}".format(G.graph["id"]) - print("Filtering group 
{}".format(G.graph["id"])) ################################################################ # TABLES: EDGES AND MZ_PREC_FRAG @@ -370,20 +387,10 @@ def filter_mf_single_tree(G, path_db): MZ_ID_FRAG TEXT, MF_ID_PREC_FLAG INTEGER, MF_ID_NL_FLAG INTEGER, - MF_ID_FRAG_FLAG INTERGER, + MF_ID_FRAG_FLAG INTEGER, PRIMARY KEY(MF_ID_PREC, MF_ID_NL, MF_ID_FRAG) )""".format(prefix)) - cursor.execute(""" - CREATE INDEX EDGES_IDS{}_IDX - ON EDGES{} (MF_ID_PREC, MF_ID_NL, MF_ID_FRAG); - """.format(prefix, prefix)) - - cursor.execute(""" - CREATE INDEX EDGES_FLAGS{}_IDX - ON EDGES{} (MF_ID_PREC_FLAG, MF_ID_NL_FLAG, MF_ID_FRAG_FLAG) - """.format(prefix, prefix)) - cursor.execute("""DROP TABLE IF EXISTS MZ_PREC_FRAG{}""".format(prefix)) cursor.execute(""" @@ -402,8 +409,8 @@ def filter_mf_single_tree(G, path_db): improvement = [cursor.fetchone()[0]] - if G.number_of_nodes() == 0: - return '' + # if G.number_of_nodes() == 0: + # return "" # Insert all inital edges for edge in [edge for edge in G.edges(data=True) if edge[2]["type"] == "e" and edge[2]["mzdiff"] > 0.5]: @@ -412,7 +419,8 @@ def filter_mf_single_tree(G, path_db): mz_id_prec, mz_id_frag, mz_id_nl = edge[0], edge[1], "{}__{}".format(edge[0], edge[1]) sub_queries = ['MF1.MZ_ID = "{}" AND MF2.MZ_ID = "{}" AND MF3.MZ_ID = "{}" ' - 'AND MF1.PRECURSOR = 1 AND MF1.ADDUCT = MF3.ADDUCT'.format(mz_id_prec, mz_id_nl, mz_id_frag), + 'AND MF1.PRECURSOR = 1 AND MF2.PRECURSOR IS NULL ' + 'AND MF1.ADDUCT = MF3.ADDUCT AND MF2.ADDUCT IS NULL'.format(mz_id_prec, mz_id_nl, mz_id_frag), 'MF1.FLAG >= 0 AND MF2.FLAG >= 0 AND MF3.FLAG >= 0'] # Number of C in the fragment MF should be smaller or equal to the number of C in the precursor MF @@ -427,12 +435,33 @@ def filter_mf_single_tree(G, path_db): INSERT INTO MZ_PREC_FRAG{} (MZ_ID_PREC, MZ_ID_FRAG) VALUES ('{}','{}')""".format(prefix, mz_id_prec, mz_id_frag)) + # cursor.execute(""" + # EXPLAIN QUERY PLAN + # SELECT MF1.MF_ID, MF2.MF_ID, MF3.MF_ID, MF1.MASS, MF2.MASS, MF3.MASS, MF1.MZ_ID, MF2.MZ_ID, MF3.MZ_ID, 0, 0, 0 + # FROM MF{} as MF1, MF{} as MF2, MF{} as MF3 + # WHERE {} + # """.format(prefix, prefix, prefix, " AND ".join(map(str, sub_queries)))) + # print(cursor.fetchall()) + cursor.execute(""" INSERT INTO EDGES{} SELECT MF1.MF_ID, MF2.MF_ID, MF3.MF_ID, MF1.MASS, MF2.MASS, MF3.MASS, MF1.MZ_ID, MF2.MZ_ID, MF3.MZ_ID, 0, 0, 0 FROM MF{} as MF1, MF{} as MF2, MF{} as MF3 WHERE {} """.format(prefix, prefix, prefix, prefix, " AND ".join(map(str, sub_queries)))) + + conn.commit() + + cursor.execute(""" + CREATE INDEX EDGES_IDS{}_IDX + ON EDGES{} (MF_ID_PREC, MF_ID_NL, MF_ID_FRAG); + """.format(prefix, prefix)) + + cursor.execute(""" + CREATE INDEX EDGES_FLAGS{}_IDX + ON EDGES{} (MF_ID_PREC_FLAG, MF_ID_NL_FLAG, MF_ID_FRAG_FLAG) + """.format(prefix, prefix)) + conn.commit() cursor.execute(""" @@ -478,8 +507,6 @@ def filter_mf_single_tree(G, path_db): prefix, prefix)) # AND MF1.FLAG >= 1 AND MF1.FLAG > 1 AND MF1.FLAG >= 1; conn.commit() - conn.commit() - # Update flag MF table based on the flag in table edges and the number of loops values = (loop, loop, loop, loop,) cursor.execute(""" @@ -619,18 +646,13 @@ def filter_mf_single_tree(G, path_db): """.format(prefix)) improvement.append(cursor.fetchone()[0]) - print("MF AFTER CONSTRAINS: ", ", ".join(map(str, improvement))) + # print("MF AFTER CONSTRAINS: ", ", ".join(map(str, improvement))) - mf_tree_out = mf_tree(G, path_db, max_mslevel, prefix) + mf_tree_out = mf_tree(G, path_db, max_mslevel, remove, prefix) conn.close() - if mf_tree_out: - return mf_tree_out - else: - return 
'' - - + return mf_tree_out def print_formula(atom_counts: dict): @@ -651,41 +673,101 @@ def print_formula(atom_counts: dict): def rank_mf(trees: Sequence[nx.classes.ordered.OrderedDiGraph], rank_threshold: int = 0): - columns = ["TreeID", 'GroupID', 'MolecularFormulaID', 'MolecularFormula', 'Adduct', 'Rank', 'TotalRanks', 'RankedEqual', 'Trees', 'NeutralLossesExplained'] + columns = ["tree_id", "group_id", "mz", "scan_events", "max_mslevel", + "mf_id", "molecular_formula", "adduct", "mass", "ppm_error", + "rank", "total_ranks", "ranked_equal", "trees", + "neutral_losses_explained"] #, "average_number_mf_edge"] df = pd.DataFrame(columns=columns) + def _scan_events(headers): + scan_events = ["ms1"] + for header in headers: + info = re.findall(r'(ms\d+) ([\w\.-]+)@([a-zA-Z]+\d+\.\d+)', header) + if len(info) >= 1: + for match in info: + info_str = "{}@{}".format(match[0], match[2]) + if info_str not in scan_events: + scan_events.append(info_str) + return scan_events + annotated_trees = collections.OrderedDict() for graph in trees: - tree_id = graph.graph["id"].split("_") - annotated_trees.setdefault(tree_id[0], []).append(graph) - for i, graphs in enumerate(annotated_trees.values()): + if isinstance(graph.graph["id"], int) and "_" not in str(graph.graph["id"]): + group_id = str(graph.graph["id"]) # Tree without mf annotations + else: + group_id = graph.graph["id"].split("_")[0] + annotated_trees.setdefault(group_id, []).append(graph) - if len(graphs) == 0: - continue + for i, graphs in enumerate(annotated_trees.values()): df_subset = pd.DataFrame(columns=columns) - for graph in graphs: - mf_id = graph.graph["id"].split("_")[1] - group_id = graph.graph["id"].split("_")[0] - mf = str(graph.nodes[list(graph.nodes())[0]]["mf"][str(mf_id)]["mf"]) - adduct = str(graph.nodes[list(graph.nodes())[0]]["mf"][str(mf_id)]["adduct"]) - # print(list(graph.nodes(data=True))[0]) - values = [graph.graph["id"], group_id, mf_id, mf, adduct, 0, 0, 0, len(graphs), graph.number_of_edges()]#, mf_str, ion_str] + + if len(graphs) == 1 and isinstance(graphs[0].graph["id"], int): + max_mslevel = max(nx.get_node_attributes(graph, 'mslevel').values()) + headers = list(nx.get_node_attributes(graph, 'header').values()) + scan_events = ",".join(map(str, _scan_events(headers))) + prec_node = list(graphs[0].nodes(data=True))[0] + if prec_node[1]["precursor"] and prec_node[1]["mslevel"] == 1: + mz = prec_node[1]["mz"] + else: + mz = np.nan + values = [str(graphs[0].graph["id"]), graphs[0].graph["id"], mz, scan_events, max_mslevel, + np.nan, np.nan, np.nan, np.nan, np.nan, + 0, 0, 0, len(graphs), + 0] d = collections.OrderedDict(zip(columns, values)) df_subset = df_subset.append(d, ignore_index=True) - - df_subset["Rank"] = df_subset['NeutralLossesExplained'].rank(method='dense', ascending=False).astype(int) - df_subset['RankedEqual'] = df_subset.groupby('NeutralLossesExplained')['NeutralLossesExplained'].transform('count') - df_subset['TotalRanks'] = df_subset['Rank'].nunique() - df_subset = df_subset.sort_values(by=['Rank', 'MolecularFormulaID']) - df = pd.concat([df, df_subset], ignore_index=True) + df = pd.concat([df, df_subset], ignore_index=True) + else: + for graph in graphs: + l = [] + for e in list(graph.edges(data=True)): + l.append(len(e[2]["mf"])) + print(len(e[2]["mf"])) + avg_mf_per_edge = sum(l)/float(len(l)) + + mf_group_id = graph.graph["id"].split("_") + mf_id = int(mf_group_id[1]) + group_id = int(mf_group_id[0]) + max_mslevel = max(nx.get_node_attributes(graph, 'mslevel').values()) + headers = 
list(nx.get_node_attributes(graph, 'header').values()) + scan_events = ",".join(map(str, _scan_events(headers))) + prec_node = list(graph.nodes())[0] + mz = graph.nodes[prec_node]["mz"] + mf = str(graph.nodes[prec_node]["mf"][str(mf_id)]["mf"]) + adduct = str(graph.nodes[prec_node]["mf"][str(mf_id)]["adduct"]) + exact_mass = graph.nodes[prec_node]["mf"][str(mf_id)]["mass"] + ppm_error = float(mz - exact_mass) / (exact_mass * 0.000001) + values = [graph.graph["id"], group_id, mz, scan_events, max_mslevel, + mf_id, mf, adduct, exact_mass, ppm_error, + 0, 0, 0, len(graphs), + graph.number_of_edges()] + d = collections.OrderedDict(zip(columns, values)) + df_subset = df_subset.append(d, ignore_index=True) + + df_subset["rank"] = df_subset['neutral_losses_explained'].rank(method='dense', ascending=False).astype(int) + df_subset['ranked_equal'] = df_subset.groupby('neutral_losses_explained')['neutral_losses_explained'].transform('count') + df_subset['total_ranks'] = df_subset['rank'].nunique() + df_subset = df_subset.sort_values(by=['rank', 'mf_id']) + df = pd.concat([df, df_subset], ignore_index=True) + + # Workaround for Pandas casting INT fo Float when Nan is present + if df.isnull().sum().sum() > 0: + for c in columns: + if c not in ["tree_id", "mz", "scan_events", + "molecular_formula", "adduct", "mass", "ppm_error"]: + df[c] = df[c].astype('Int64') for G in trees: - G.graph["rank"] = int(df[df["TreeID"] == G.graph["id"]]["Rank"]) - G.graph["mf_id"] = int(df[df["TreeID"] == G.graph["id"]]["MolecularFormulaID"]) + G.graph["rank"] = int(df[df["tree_id"] == str(G.graph["id"])]["rank"].iloc[0]) + G.graph["mf_id"] = df[df["tree_id"] == str(G.graph["id"])]["mf_id"].iloc[0] + if pd.isna(G.graph["mf_id"]): + G.graph["mf_id"] = None + else: + G.graph["mf_id"] = int(G.graph["mf_id"]) - trees_ranked = sorted(trees, key=lambda i: (int(i.graph['id'].split("_")[0]), i.graph['rank'], i.graph["mf_id"])) + trees_ranked = sorted(trees, key=lambda i: (int(str(i.graph['id']).split("_")[0]), i.graph['rank'], i.graph["mf_id"])) if rank_threshold > 0: trees_ranked = [tree for tree in trees_ranked if tree.graph["rank"] <= rank_threshold] diff --git a/tests/data/ranks_21-hydroxyprogesterone.txt b/tests/data/ranks_21-hydroxyprogesterone.txt index 40e2ed0..d5d9a4b 100644 --- a/tests/data/ranks_21-hydroxyprogesterone.txt +++ b/tests/data/ranks_21-hydroxyprogesterone.txt @@ -1,7 +1,7 @@ -TreeID GroupID MolecularFormulaID MolecularFormula Adduct Rank TotalRanks RankedEqual Trees NeutralLossesExplained -1_6 1 6 C21H30O3 [M+H]+ 1 5 1 6 36 -1_2 1 2 C15H31N4O2P [M+H]+ 2 5 2 6 20 -1_4 1 4 C18H34O3S [M+H]+ 2 5 2 6 20 -1_5 1 5 C20H31N2P [M+H]+ 3 5 1 6 19 -1_3 1 3 C17H26N6O [M+H]+ 4 5 1 6 17 -1_1 1 1 C14H30N6OS [M+H]+ 5 5 1 6 8 +tree_id group_id mz scan_events max_mslevel mf_id molecular_formula adduct mass ppm_error rank total_ranks ranked_equal trees neutral_losses_explained +1_6 1 331.22706604003906 ms1,ms2@cid35.00,ms3@cid35.00 3 6 C21H30O3 [M+H]+ 331.2267714 0.8895417414445896 1 5 1 6 36 +1_2 1 331.22706604003906 ms1,ms2@cid35.00,ms3@cid35.00 3 2 C15H31N4O2P [M+H]+ 331.2257394 4.005244403579925 2 5 2 6 20 +1_4 1 331.22706604003906 ms1,ms2@cid35.00 2 4 C18H34O3S [M+H]+ 331.23014240000003 -9.287681183484377 2 5 2 6 20 +1_5 1 331.22706604003906 ms1,ms2@cid35.00,ms3@cid35.00 3 5 C20H31N2P [M+H]+ 331.2297624 -8.140451937130637 3 5 1 6 19 +1_3 1 331.22706604003906 ms1,ms2@cid35.00 2 3 C17H26N6O [M+H]+ 331.22408540000004 8.998862614195307 4 5 1 6 17 +1_1 1 331.22706604003906 ms1,ms2@cid35.00 2 1 C14H30N6OS [M+H]+ 
331.22745740000005 -1.1815444409649882 5 5 1 6 8 diff --git a/tests/test_annotation.py b/tests/test_annotation.py index 9b03dcb..62ab576 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -92,7 +92,7 @@ def test_annotate_mf(self): self.assertTupleEqual(records[0], ('331.2271_0_7', 1, 14, 30, 6, 1, 0, 1, 3, 1, 1, 1, 1, '[M+H]+', 331.22745740000005, 331.22706604003906, -1.18, 1, '1', 0)) self.assertTupleEqual(records[5], ('331.2271_0_7', 6, 21, 30, 0, 3, 0, 0, 7, 1, 1, 1, 1, '[M+H]+', 331.2267714, 331.22706604003906, 0.89, 1, '1', 0)) - self.assertTupleEqual(records[-1], ('313.2163_1_31__295.2056_3_0', 583, 0, 2, 0, 1, 0, 0, 0, 1, 1, 0, 1, None, 18.010565, 18.0106583, -5.18, None, '2_3', 0)) + self.assertTupleEqual(records[-1], ('313.2163_1_31__295.2056_3_0', 583, 0, 2, 0, 1, 0, 0, 0, 1, 1, 0, 1, None, 18.010565, 18.0106583, 5.18, None, '2_3', 0)) cursor.execute("select count(*) from MF_1") self.assertEqual(cursor.fetchone()[0], 583)
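
The --time-limit handling above now treats 0 as "no limit" and re-registers the SIGALRM handler for every tree inside annotate_mf and filter_mf. The sketch below is a minimal, self-contained version of that pattern, with a dummy slow_square() workload standing in for the per-tree annotation; it assumes a POSIX platform, since signal.alarm is not available on Windows.

    import signal
    import time

    class TimeoutException(Exception):
        pass

    def _raise_timeout(signum, frame):
        raise TimeoutException()

    def slow_square(x):
        time.sleep(x)  # stand-in for processing one spectral tree
        return x * x

    def process_with_time_limit(items, time_limit=0):
        results = []
        for item in items:
            signal.signal(signal.SIGALRM, _raise_timeout)
            if time_limit:
                signal.alarm(time_limit)  # arm a per-item timer; 0 leaves it disarmed
            try:
                results.append(slow_square(item))
            except TimeoutException:
                results.append(item)  # keep the unprocessed item, as filter_mf now does
            finally:
                if time_limit:
                    signal.alarm(0)  # always disarm before the next item
        return results

    print(process_with_time_limit([0, 3], time_limit=1))  # -> [0, 3]; the second item times out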
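
The sign of the neutral-loss ppm error in annotate_mf_single was flipped to (observed - theoretical) / theoretical, which is why the expected value in tests/test_annotation.py changes from -5.18 to 5.18, and the same convention is used for the ppm_error column now written by rank_mf. A small sketch of that convention, checked against the precursor values in the updated 21-hydroxyprogesterone ranks file above:

    def ppm_error(observed_mz, theoretical_mz):
        """Signed error in parts per million; positive when the observed m/z is high."""
        return (observed_mz - theoretical_mz) / (theoretical_mz * 0.000001)

    # C21H30O3 [M+H]+ from the test data: observed 331.22706604003906, exact mass 331.2267714
    print(round(ppm_error(331.22706604003906, 331.2267714), 2))  # -> 0.89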
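
Much of the speed-up in filter_mf_single_tree comes from two SQLite patterns: relaxing durability with PRAGMA synchronous = OFF and PRAGMA journal_mode = OFF for the scratch working database, and creating the EDGES indexes only after the bulk INSERT so each B-tree is built once instead of being maintained per inserted row. A stripped-down sketch of that pattern; the table and index names here are illustrative, not the MF/EDGES schema from the patch:

    import sqlite3

    conn = sqlite3.connect(":memory:")  # the patch uses the file-backed annotation database
    cur = conn.cursor()

    # Trade durability for speed; acceptable for a scratch database that can be regenerated.
    cur.execute("PRAGMA synchronous = OFF")
    cur.execute("PRAGMA journal_mode = OFF")

    cur.execute("CREATE TABLE edges (mf_id_prec INTEGER, mf_id_frag INTEGER, flag INTEGER)")

    rows = [(p, f, 0) for p in range(1000) for f in range(10)]  # dummy candidate pairs
    cur.executemany("INSERT INTO edges VALUES (?, ?, ?)", rows)  # bulk load first...
    conn.commit()

    # ...then build the indexes once, matching the column order of the later filter queries.
    cur.execute("CREATE INDEX edges_ids_idx ON edges (mf_id_prec, mf_id_frag)")
    cur.execute("CREATE INDEX edges_flag_idx ON edges (flag)")
    conn.commit()
    conn.close()

The reordered MF_F_P_MSL_{}_IDX index (FLAG, PRECURSOR, MSLEVEL) follows the same idea, presumably so the flag-based filters in the constraint queries can lead the index.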
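
The new scan_events column in the ranking summary is built by the _scan_events helper, which pulls "msN@activationENERGY" tokens out of the scan headers with a regular expression. A quick usage sketch; the header string below is an assumed Thermo-style example, not taken from the package's test data:

    import re

    header = "FTMS + p ESI Full ms2 331.23@cid35.00 [90.00-345.00]"  # assumed example header

    # Same pattern as _scan_events in rank_mf above.
    matches = re.findall(r'(ms\d+) ([\w\.-]+)@([a-zA-Z]+\d+\.\d+)', header)
    scan_events = ["ms1"] + ["{}@{}".format(m[0], m[2]) for m in matches]
    print(",".join(scan_events))  # -> ms1,ms2@cid35.00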
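
The reworked rank_mf keeps the same ranking logic (a dense rank over neutral_losses_explained, a count of equally ranked candidates, and the number of distinct ranks) but now also has to cope with unannotated trees, whose numeric columns are NaN; casting those columns to pandas' nullable Int64 dtype avoids the usual int-to-float upcast. A toy version of that ranking step, with made-up candidate rows:

    import pandas as pd

    df = pd.DataFrame({
        "mf_id": [6, 2, 4, 5, None],  # None: a tree that kept no molecular formula annotation
        "neutral_losses_explained": [36, 20, 20, 19, 0],
    })

    df["rank"] = df["neutral_losses_explained"].rank(method="dense", ascending=False).astype(int)
    df["ranked_equal"] = df.groupby("neutral_losses_explained")["neutral_losses_explained"].transform("count")
    df["total_ranks"] = df["rank"].nunique()

    # Nullable integer dtype keeps mf_id integral despite the missing value.
    df["mf_id"] = df["mf_id"].astype("Int64")

    print(df.sort_values(by=["rank", "mf_id"]))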