From 71784a6e30bcc7648e8f0de6cc806a72755e4744 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol
Date: Sat, 30 Nov 2024 19:49:23 +0000
Subject: [PATCH] black applied.

---
 quantmsutils/mzml/mzml_statistics.py | 44 +++++++++++++++-------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/quantmsutils/mzml/mzml_statistics.py b/quantmsutils/mzml/mzml_statistics.py
index fbda986..c242312 100644
--- a/quantmsutils/mzml/mzml_statistics.py
+++ b/quantmsutils/mzml/mzml_statistics.py
@@ -283,8 +283,32 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
         "SELECT Value FROM GlobalMetadata WHERE key='AcquisitionDateTime'"
     ).fetchone()[0]
 
+    # Check which optional columns exist
     columns = column_exists(conn, "frames")
 
+    # Get allowed columns from the schema
+    allowed_columns = {
+        'Id': 'Id',
+        'MsMsType': 'CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END',
+        'NumPeaks': 'NumPeaks',
+        'MaxIntensity': 'MaxIntensity',
+        'SummedIntensities': 'SummedIntensities',
+        'Time': 'Time',
+        'Charge': 'Charge',
+        'MonoisotopicMz': 'MonoisotopicMz'
+    }
+
+    # Construct safe column list
+    safe_columns = []
+    column_mapping = {}
+    for schema_col_name, sql_expr in allowed_columns.items():
+        if schema_col_name in columns or schema_col_name == 'Id':
+            safe_columns.append(sql_expr)
+            column_mapping[schema_col_name] = sql_expr
+
+    # Construct the query using parameterized safe columns
+    query = f"""SELECT {', '.join(safe_columns)} FROM frames"""
+
     schema = pa.schema(
         [
             pa.field("Id", pa.int32(), nullable=False),
@@ -302,26 +326,6 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
     # Set up parquet writer
     parquet_writer = pq.ParquetWriter(output_path, schema=schema, compression="gzip")
 
-    base_columns = [
-        "Id",
-        "CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END as MsMsType",
-        "NumPeaks",
-        "MaxIntensity",
-        "SummedIntensities",
-        "Time",
-    ]
-
-    if "Charge" in columns:
-        base_columns.insert(-1, "Charge")  # Add before the last column for logical flow
-
-    if "MonoisotopicMz" in columns:
-        base_columns.insert(-1, "MonoisotopicMz")
-
-    safe_columns = [
-        col for col in base_columns if col.replace(" ", "").isalnum()
-    ]  # Remove spaces
-    query = f"""SELECT {', '.join(safe_columns)} FROM frames """
-
     try:
         # Stream data in batches
         for chunk in pd.read_sql_query(query, conn, chunksize=batch_size):
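
For context (not part of the patch): below is a minimal, self-contained sketch of the column-whitelist approach the added lines use. It assumes the caller already has the set of column names that the `column_exists(conn, "frames")` helper returns in the real code, so it can be run standalone without a Bruker .d SQLite file; names like `build_frames_query` are illustrative only.

```python
# Illustrative sketch of the whitelist-based query construction (not the
# library code itself). ALLOWED_COLUMNS maps each schema column name to the
# SQL expression used to select it, mirroring the dict added in the patch.
ALLOWED_COLUMNS = {
    "Id": "Id",
    "MsMsType": "CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END",
    "NumPeaks": "NumPeaks",
    "MaxIntensity": "MaxIntensity",
    "SummedIntensities": "SummedIntensities",
    "Time": "Time",
    "Charge": "Charge",
    "MonoisotopicMz": "MonoisotopicMz",
}


def build_frames_query(existing_columns: set) -> str:
    """Build a SELECT over the frames table from whitelisted expressions,
    keeping only columns that exist in this file (Id is always kept)."""
    safe_columns = [
        sql_expr
        for name, sql_expr in ALLOWED_COLUMNS.items()
        if name in existing_columns or name == "Id"
    ]
    return f"SELECT {', '.join(safe_columns)} FROM frames"


# Example: an older file without the optional Charge / MonoisotopicMz columns.
print(
    build_frames_query(
        {"Id", "MsMsType", "NumPeaks", "MaxIntensity", "SummedIntensities", "Time"}
    )
)
```

Because only expressions from the fixed whitelist can ever reach the f-string, the generated SQL cannot contain arbitrary column names, which is the point of replacing the old `base_columns` / `isalnum()` filtering removed in the second hunk.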