From b72cd815ba0f90639841e01033d0115a8892d011 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sat, 9 Nov 2024 22:32:04 +0100 Subject: [PATCH 01/30] wrapper4insty is added as calcSignatureInteractions() --- prody/proteins/interactions.py | 287 ++++++++++++++++++++++++++++++++- 1 file changed, 286 insertions(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index e6918a6cf..c741b426c 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -32,6 +32,8 @@ from prody.ensemble import Ensemble import multiprocessing +import matplotlib.pylab as plt +import os __all__ = ['calcHydrogenBonds', 'calcChHydrogenBonds', 'calcSaltBridges', 'calcRepulsiveIonicBonding', 'calcPiStacking', 'calcPiCation', @@ -47,7 +49,7 @@ 'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory', 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', - 'saveInteractionsAsDummyAtoms'] + 'saveInteractionsAsDummyAtoms', 'calcSignatureInteractions'] def cleanNumbers(listContacts): @@ -308,6 +310,233 @@ def showPairEnergy(data, **kwargs): return data +# SignatureInteractions supporting functions +from Bio.PDB.Polypeptide import three_to_one +import shutil +import importlib.util + +def remove_empty_strings(row): + """Remove empty strings from a list.""" + return [elem for elem in row if elem != ''] + +def log_message(message, level="INFO"): + """Log a message with a specified log level.""" + print(f"[{level}] {message}") + +def is_module_installed(module_name): + """Check if a Python module is installed.""" + spec = importlib.util.find_spec(module_name) + return spec is not None + +def is_command_installed(command): + """Check if a command-line tool is installed.""" + return shutil.which(command) is not None + +def load_residues_from_pdb(pdb_file): + """Extract residue numbers and their corresponding one-letter amino acid codes from a PDB file.""" + structure = parsePDB(pdb_file) + residues = structure.iterResidues() + residue_dict = {} + + for res in residues: + resnum = res.getResnum() + resname = res.getResname() # Three-letter amino acid code + try: + one_letter_code = three_to_one(resname) # Convert to one-letter code + residue_dict[resnum] = one_letter_code + except KeyError: + log_message(f"Unknown residue: {resname} at position {resnum}", "WARNING") + + return residue_dict + +def append_residue_code(residue_num, residue_dict): + """Return a string with one-letter amino acid code and residue number.""" + aa_code = residue_dict.get(residue_num, "X") # Use "X" for unknown residues + return f"{aa_code}{residue_num}" + +def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): + """Process the mapping file and the PDB folder to compute interaction counts and percentages.""" + log_message(f"Loading mapping file: {mapping_file}") + + # Load and clean the mapping file + try: + mapping = np.loadtxt(mapping_file, delimiter=' ', dtype=str) + except Exception as e: + log_message(f"Error loading mapping file: {e}", "ERROR") + return None + + mapping = np.where(mapping == '-', np.nan, mapping) + filtered_mapping = np.array([remove_empty_strings(row) for row in mapping]) + mapping_num = filtered_mapping.astype(float) + + # Load the one-letter amino acid codes from model1.pdb + pdb_model_path = os.path.join(pdb_folder, 'model1.pdb') + residue_dict = load_residues_from_pdb(pdb_model_path) + + log_message(f"Processing PDB files in folder: {pdb_folder}") + + tar_bond_ind = [] + processed_files = 0 # To track the number of files successfully processed + fixed_files = [] # Store paths of fixed files to remove at the end + + for i, files in enumerate(os.listdir(pdb_folder)): + # Skip any file that has already been processed (files with 'addH_' prefix) + if files.startswith('addH_'): + log_message(f"Skipping already fixed file: {files}", "INFO") + continue + + log_message(f"Processing file {i+1}: {files}") + + pdb_file_path = os.path.join(pdb_folder, files) + fixed_pdb_path = pdb_file_path.replace(files, 'addH_' + files) + + # Check if the fixed file already exists, skip fixing if it does + if not os.path.exists(fixed_pdb_path): + try: + log_message(f"Running fixer on {pdb_file_path} using {fixer}.") + addMissingAtoms(pdb_file_path, method=fixer) + except Exception as e: + log_message(f"Error adding missing atoms: {e}", "ERROR") + continue + else: + log_message(f"Using existing fixed file: {fixed_pdb_path}") + + try: + coords = parsePDB(fixed_pdb_path) + atoms = coords.select('protein') + interactions = Interactions() + bonds = interaction_func(atoms) + except Exception as e: + log_message(f"Error processing PDB file {files}: {e}", "ERROR") + continue + + # If no bonds were found, skip this file + if len(bonds) == 0: + log_message(f"No {bond_type} found in file {files}, skipping.", "WARNING") + continue + + processed_files += 1 # Increment successfully processed files + fixed_files.append(fixed_pdb_path) + + if processed_files == 1: # First valid file with bonds determines target indices + for entries in bonds: + ent = list(np.sort((int(entries[0][3:]), int(entries[3][3:])))) # Ensure integers + tar_bond_ind.append(ent) + tar_bond_ind = np.unique(np.array(tar_bond_ind), axis=0).astype(int) + count = np.zeros(tar_bond_ind.shape[0], dtype=int) + + bond_ind = [] + for entries in bonds: + ent = list(np.sort((int(entries[0][3:]), int(entries[3][3:])))) # Ensure integers + bond_ind.append(ent) + bond_ind = np.unique(np.array(bond_ind), axis=0) + + # Ensure bond_ind is a 2D array + if bond_ind.ndim == 1: + bond_ind = bond_ind.reshape(-1, 2) # Reshape to (n, 2) if it's 1D + + for j, pairs in enumerate(tar_bond_ind): + ind1_matches = np.where(mapping_num[0] == pairs[0])[0] + ind2_matches = np.where(mapping_num[0] == pairs[1])[0] + + if ind1_matches.size > 0 and ind2_matches.size > 0: + if processed_files - 1 < mapping_num.shape[0]: + ind1 = mapping_num[processed_files - 1, ind1_matches[0]] + ind2 = mapping_num[processed_files - 1, ind2_matches[0]] + + if not (np.isnan(ind1) or np.isnan(ind2)): + index = np.where(np.logical_and(bond_ind[:, 0] == int(ind1), bond_ind[:, 1] == int(ind2)))[0] + if index.size != 0: + count[j] += 1 + else: + log_message(f"Skipping file {files} due to index out of bounds error", "WARNING") + else: + log_message(f"No matching indices found for {pairs} in {files}", "WARNING") + + # If no files were successfully processed or no bonds were found + if processed_files == 0 or len(tar_bond_ind) == 0: + log_message(f"No valid {bond_type} entries found across all PDB files.", "ERROR") + return None + + count_reshaped = count.reshape(-1, 1) + count_normalized = (count / processed_files * 100).reshape(-1, 1) + + # Modify tar_bond_ind to append the amino acid code before residue index + tar_bond_with_aa = [] + for bond in tar_bond_ind: + res1 = append_residue_code(bond[0], residue_dict) # Append AA code for Res1 + res2 = append_residue_code(bond[1], residue_dict) # Append AA code for Res2 + tar_bond_with_aa.append([res1, res2]) + + tar_bond_with_aa = np.array(tar_bond_with_aa) + + # Combine tar_bond_with_aa with count and percentage + output_data = np.hstack((tar_bond_with_aa, count_reshaped, count_normalized)) + + log_message(f"Finished processing {processed_files} PDB files.") + output_filename = f'{bond_type}_consensus.txt' + + # Save the result with amino acid codes and numeric values + np.savetxt(output_filename, output_data, fmt='%s %s %s %s', delimiter=' ', + header='Res1 Res2 Count Percentage', comments='') + + return output_data, fixed_files # Return fixed_files to remove later + + +def plot_barh(result, bond_type, n_per_plot=None, min_height=8): + """Plot horizontal bar plots of percentages, splitting the data into fixed-sized plots.""" + plt.rcParams.update({'font.size': 20}) + + # Set default value for n_per_plot if None is passed + if n_per_plot is None: + n_per_plot = 20 + + # Error handling if result is None or empty + if result is None or len(result) == 0: + log_message(f"Skipping plot for {bond_type} due to insufficient data.", "ERROR") + return + + num_entries = result.shape[0] + num_plots = (num_entries + n_per_plot - 1) // n_per_plot # Number of plots required + + for plot_idx in range(num_plots): + # Slice the data for the current plot (take the next 'n_per_plot' entries) + start_idx = plot_idx * n_per_plot + end_idx = min((plot_idx + 1) * n_per_plot, num_entries) + result_chunk = result[start_idx:end_idx] + + log_message(f"Plotting entries {start_idx+1} to {end_idx} for {bond_type}.") + + # Use residue numbers for y-axis labels + y_labels = [f"{str(row[0])}-{str(row[1])}" for row in result_chunk] + percentage_values = result_chunk[:, 3].astype('float') + + norm = plt.Normalize(vmin=0, vmax=100) + cmap = plt.cm.get_cmap('coolwarm') + + # Set the figure height with a minimum height of `min_height` for small datasets + fig_height = max(min_height, len(result_chunk) * 0.4) + plt.figure(figsize=(18, fig_height)) + bars = plt.barh(y_labels, percentage_values, color=cmap(norm(percentage_values))) + + sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) + sm.set_array([]) + + # Adjust color bar height to match the number of entries + plt.colorbar(sm, label='Percentage', fraction=0.02, pad=0.04) + + plt.ylim(-1, result_chunk.shape[0]) + plt.ylabel(f'{bond_type} Pairs of Residue Numbers') + plt.xlabel('Percentage') + plt.title(f'Persistence of {bond_type} Bonds (entries {start_idx+1}-{end_idx})') + + # Save each plot with an incremented filename for multiple plots + output_plot_file = f'{bond_type}_plot_part{plot_idx + 1}.png' + log_message(f"Saving plot to: {output_plot_file}") + plt.savefig(output_plot_file) + plt.close() + log_message(f"Plot saved successfully.") + def calcHydrophobicOverlapingAreas(atoms, **kwargs): """Provide information about hydrophobic contacts between pairs of residues based on @@ -3010,6 +3239,62 @@ def showSminaTermValues(data): return show +def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): + """Analyzes protein structures to identify various interactions using InSty. + Processes data from the MSA file and folder with selected models. + + :arg mapping_file: Aligned residue indices, MSA file type + :type mapping_file: str + + :arg PDB_folder: Directory containing PDB model files + :type PDB_folder: str + + :arg fixer: The method for fixing lack of hydrogen bonds + :type fixer: 'pdbfixer' or 'openbabel' + """ + + import os + functions = { + "HydrogenBonds": calcHydrogenBonds, + "SaltBridges": calcSaltBridges, + "RepulsiveIonicBonding": calcRepulsiveIonicBonding, + "PiStacking": calcPiStacking, + "PiCation": calcPiCation, + "Hydrophobic": calcHydrophobic, + "DisulfideBonds": calcDisulfideBonds + } + + # Process each bond type + for bond_type, func in functions.items(): + # Check if the consensus file already exists + #consensus_file = f'{bond_type}_consensus.txt' + #if os.path.exists(consensus_file): + # log_message(f"Consensus file for {bond_type} already exists, skipping.", "INFO") + # continue + + log_message(f"Processing {bond_type}") + + result = process_data(mapping_file, PDB_folder, func, bond_type, fixer) + + # Check if the result is None (no valid bonds found) + if result is None: + log_message(f"No valid {bond_type} entries found, skipping further processing.", "WARNING") + continue + + result, fixed_files = result + + # Proceed with plotting + n = None + plot_barh(result, bond_type, n) + + # Remove all fixed files at the end + if 'fixed_files' in locals(): + for fixed_file in fixed_files: + if os.path.exists(fixed_file): + os.remove(fixed_file) + log_message(f"Removed fixed file: {fixed_file}") + + class Interactions(object): From 4505312a13bfe4d2ef5fe7adbf637dc991a90abe Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sat, 9 Nov 2024 23:01:05 +0100 Subject: [PATCH 02/30] fstrings removed --- prody/proteins/interactions.py | 58 ++++++++++++++++------------------ 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index c741b426c..2c47b9753 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -321,7 +321,7 @@ def remove_empty_strings(row): def log_message(message, level="INFO"): """Log a message with a specified log level.""" - print(f"[{level}] {message}") + print("[{}] {}".format(level, message)) def is_module_installed(module_name): """Check if a Python module is installed.""" @@ -345,24 +345,24 @@ def load_residues_from_pdb(pdb_file): one_letter_code = three_to_one(resname) # Convert to one-letter code residue_dict[resnum] = one_letter_code except KeyError: - log_message(f"Unknown residue: {resname} at position {resnum}", "WARNING") - + log_message("Unknown residue: {} at position {}".format(resname, resnum), "WARNING") + return residue_dict def append_residue_code(residue_num, residue_dict): """Return a string with one-letter amino acid code and residue number.""" aa_code = residue_dict.get(residue_num, "X") # Use "X" for unknown residues - return f"{aa_code}{residue_num}" + return "{}{}".format(aa_code, residue_num) def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): """Process the mapping file and the PDB folder to compute interaction counts and percentages.""" - log_message(f"Loading mapping file: {mapping_file}") + log_message("Loading mapping file: {}".format(mapping_file)) # Load and clean the mapping file try: mapping = np.loadtxt(mapping_file, delimiter=' ', dtype=str) except Exception as e: - log_message(f"Error loading mapping file: {e}", "ERROR") + log_message("Error loading mapping file: {}".format(e), "ERROR") return None mapping = np.where(mapping == '-', np.nan, mapping) @@ -373,7 +373,7 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): pdb_model_path = os.path.join(pdb_folder, 'model1.pdb') residue_dict = load_residues_from_pdb(pdb_model_path) - log_message(f"Processing PDB files in folder: {pdb_folder}") + log_message("Processing PDB files in folder: {}".format(pdb_folder)) tar_bond_ind = [] processed_files = 0 # To track the number of files successfully processed @@ -382,24 +382,24 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): for i, files in enumerate(os.listdir(pdb_folder)): # Skip any file that has already been processed (files with 'addH_' prefix) if files.startswith('addH_'): - log_message(f"Skipping already fixed file: {files}", "INFO") + log_message("Skipping already fixed file: {}".format(files), "INFO") continue - log_message(f"Processing file {i+1}: {files}") - + log_message("Processing file {}: {}".format(i + 1, files)) + pdb_file_path = os.path.join(pdb_folder, files) fixed_pdb_path = pdb_file_path.replace(files, 'addH_' + files) # Check if the fixed file already exists, skip fixing if it does if not os.path.exists(fixed_pdb_path): try: - log_message(f"Running fixer on {pdb_file_path} using {fixer}.") + log_message("Running fixer on {} using {}.".format(pdb_file_path, fixer)) addMissingAtoms(pdb_file_path, method=fixer) except Exception as e: - log_message(f"Error adding missing atoms: {e}", "ERROR") + log_message("Error adding missing atoms: {}".format(e), "ERROR") continue else: - log_message(f"Using existing fixed file: {fixed_pdb_path}") + log_message("Using existing fixed file: {}".format(fixed_pdb_path)) try: coords = parsePDB(fixed_pdb_path) @@ -407,12 +407,12 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): interactions = Interactions() bonds = interaction_func(atoms) except Exception as e: - log_message(f"Error processing PDB file {files}: {e}", "ERROR") + log_message("Error processing PDB file {}: {}".format(files, e), "ERROR") continue # If no bonds were found, skip this file if len(bonds) == 0: - log_message(f"No {bond_type} found in file {files}, skipping.", "WARNING") + log_message("No {} found in file {}, skipping.".format(bond_type, files), "WARNING") continue processed_files += 1 # Increment successfully processed files @@ -449,13 +449,13 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): if index.size != 0: count[j] += 1 else: - log_message(f"Skipping file {files} due to index out of bounds error", "WARNING") + log_message("Skipping file {} due to index out of bounds error".format(files), "WARNING") else: - log_message(f"No matching indices found for {pairs} in {files}", "WARNING") + log_message("No matching indices found for {} in {}".format(pairs, files), "WARNING") # If no files were successfully processed or no bonds were found if processed_files == 0 or len(tar_bond_ind) == 0: - log_message(f"No valid {bond_type} entries found across all PDB files.", "ERROR") + log_message("No valid {} entries found across all PDB files.".format(bond_type), "ERROR") return None count_reshaped = count.reshape(-1, 1) @@ -473,8 +473,8 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): # Combine tar_bond_with_aa with count and percentage output_data = np.hstack((tar_bond_with_aa, count_reshaped, count_normalized)) - log_message(f"Finished processing {processed_files} PDB files.") - output_filename = f'{bond_type}_consensus.txt' + log_message("Finished processing {} PDB files.".format(processed_files)) + output_filename = '{}_consensus.txt'.format(bond_type) # Save the result with amino acid codes and numeric values np.savetxt(output_filename, output_data, fmt='%s %s %s %s', delimiter=' ', @@ -493,7 +493,7 @@ def plot_barh(result, bond_type, n_per_plot=None, min_height=8): # Error handling if result is None or empty if result is None or len(result) == 0: - log_message(f"Skipping plot for {bond_type} due to insufficient data.", "ERROR") + log_message("Skipping plot for {} due to insufficient data.".format(bond_type), "ERROR") return num_entries = result.shape[0] @@ -505,10 +505,9 @@ def plot_barh(result, bond_type, n_per_plot=None, min_height=8): end_idx = min((plot_idx + 1) * n_per_plot, num_entries) result_chunk = result[start_idx:end_idx] - log_message(f"Plotting entries {start_idx+1} to {end_idx} for {bond_type}.") - + log_message("Plotting entries {} to {} for {}.".format(start_idx + 1, end_idx, bond_type)) # Use residue numbers for y-axis labels - y_labels = [f"{str(row[0])}-{str(row[1])}" for row in result_chunk] + y_labels = ["{}-{}".format(str(row[0]), str(row[1])) for row in result_chunk] percentage_values = result_chunk[:, 3].astype('float') norm = plt.Normalize(vmin=0, vmax=100) @@ -526,16 +525,15 @@ def plot_barh(result, bond_type, n_per_plot=None, min_height=8): plt.colorbar(sm, label='Percentage', fraction=0.02, pad=0.04) plt.ylim(-1, result_chunk.shape[0]) - plt.ylabel(f'{bond_type} Pairs of Residue Numbers') + plt.ylabel('{} Pairs of Residue Numbers'.format(bond_type)) plt.xlabel('Percentage') - plt.title(f'Persistence of {bond_type} Bonds (entries {start_idx+1}-{end_idx})') - + plt.title('Persistence of {} Bonds (entries {}-{})'.format(bond_type, start_idx + 1, end_idx)) # Save each plot with an incremented filename for multiple plots - output_plot_file = f'{bond_type}_plot_part{plot_idx + 1}.png' - log_message(f"Saving plot to: {output_plot_file}") + output_plot_file = '{}_plot_part{}.png'.format(bond_type, plot_idx + 1) + log_message("Saving plot to: {}".format(output_plot_file)) plt.savefig(output_plot_file) plt.close() - log_message(f"Plot saved successfully.") + log_message("Plot saved successfully.") def calcHydrophobicOverlapingAreas(atoms, **kwargs): From b8cdafca125e70015badbb8cd7fcb5b664fb5abb Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sat, 9 Nov 2024 23:08:05 +0100 Subject: [PATCH 03/30] replacing additional fstrings --- prody/proteins/interactions.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 2c47b9753..7f66f841b 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3265,18 +3265,17 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): # Process each bond type for bond_type, func in functions.items(): # Check if the consensus file already exists - #consensus_file = f'{bond_type}_consensus.txt' - #if os.path.exists(consensus_file): - # log_message(f"Consensus file for {bond_type} already exists, skipping.", "INFO") - # continue + consensus_file = '{}_consensus.txt'.format(bond_type) + if os.path.exists(consensus_file): + log_message("Consensus file for {} already exists, skipping.".format(bond_type), "INFO") + continue - log_message(f"Processing {bond_type}") - + log_message("Processing {}".format(bond_type)) result = process_data(mapping_file, PDB_folder, func, bond_type, fixer) # Check if the result is None (no valid bonds found) if result is None: - log_message(f"No valid {bond_type} entries found, skipping further processing.", "WARNING") + log_message("No valid {} entries found, skipping further processing.".format(bond_type), "WARNING") continue result, fixed_files = result @@ -3290,7 +3289,7 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): for fixed_file in fixed_files: if os.path.exists(fixed_file): os.remove(fixed_file) - log_message(f"Removed fixed file: {fixed_file}") + log_message("Removed fixed file: {}".format(fixed_file)) From 6345be8d149686550f56fbec9028b12d8b6918c0 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sat, 9 Nov 2024 23:53:39 +0100 Subject: [PATCH 04/30] extractMultiModelPDB --- prody/proteins/interactions.py | 44 ++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 7f66f841b..12257cce7 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3237,9 +3237,49 @@ def showSminaTermValues(data): return show +def extractMultiModelPDB(multimodelPDB, folder_name): + """Extracts individual PDB models from multimodel PDB and places them into the pointed directory. + If used for calculating calcSignatureInteractions align the models. + + :arg multimodelPDB: The file containing models in multi-model PDB format + :type multimodelPDB: str + + :arg folder_name: The name of the folder to which PDBs will be extracted + :type folder_name: str + """ + + folder_name = kwargs.pop('folder_name', 'struc_homologs') + + with open(multimodelPDB, 'r') as f: + file = f.readlines() + os.makedirs(folder_name, exist_ok=True) + + fp = None + for line in file: + line = line.strip() + sig1 = line[:5] + sig2 = line[:6] + sig3 = line[:4] + + if sig1 == 'MODEL': + model_number = line.split()[1] + filename = 'model{}.pdb'.format(model_number) + fp = open(filename, 'w') + continue + + if sig2 == 'ENDMDL': + if fp: + fp.close() + os.rename(filename, './{}/{}'.format(folder_name,filename)) + continue + + if sig3 == 'ATOM' and fp: + fp.write("{}\n".format(line)) + + def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): - """Analyzes protein structures to identify various interactions using InSty. - Processes data from the MSA file and folder with selected models. + """Analyzes protein structures to identify various interactions using InSty. + Processes data from the MSA file and folder with selected models. :arg mapping_file: Aligned residue indices, MSA file type :type mapping_file: str From f088e5a7567a5635e4f4e9d49e111ba49157542f Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 09:17:01 +0100 Subject: [PATCH 05/30] fstrings replaced --- prody/proteins/interactions.py | 78 +++++++++++++++++----------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index c741b426c..0a49dcaa4 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -34,6 +34,8 @@ import multiprocessing import matplotlib.pylab as plt import os +from . import fixer + __all__ = ['calcHydrogenBonds', 'calcChHydrogenBonds', 'calcSaltBridges', 'calcRepulsiveIonicBonding', 'calcPiStacking', 'calcPiCation', @@ -49,7 +51,7 @@ 'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory', 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', - 'saveInteractionsAsDummyAtoms', 'calcSignatureInteractions'] + 'saveInteractionsAsDummyAtoms', 'extractMultiModelPDB', 'calcSignatureInteractions'] def cleanNumbers(listContacts): @@ -321,7 +323,7 @@ def remove_empty_strings(row): def log_message(message, level="INFO"): """Log a message with a specified log level.""" - print(f"[{level}] {message}") + print("[{}] {}".format(level, message)) def is_module_installed(module_name): """Check if a Python module is installed.""" @@ -345,24 +347,24 @@ def load_residues_from_pdb(pdb_file): one_letter_code = three_to_one(resname) # Convert to one-letter code residue_dict[resnum] = one_letter_code except KeyError: - log_message(f"Unknown residue: {resname} at position {resnum}", "WARNING") - + log_message("Unknown residue: {} at position {}".format(resname, resnum), "WARNING") + return residue_dict def append_residue_code(residue_num, residue_dict): """Return a string with one-letter amino acid code and residue number.""" aa_code = residue_dict.get(residue_num, "X") # Use "X" for unknown residues - return f"{aa_code}{residue_num}" + return "{}{}".format(aa_code, residue_num) def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): """Process the mapping file and the PDB folder to compute interaction counts and percentages.""" - log_message(f"Loading mapping file: {mapping_file}") + log_message("Loading mapping file: {}".format(mapping_file)) # Load and clean the mapping file try: mapping = np.loadtxt(mapping_file, delimiter=' ', dtype=str) except Exception as e: - log_message(f"Error loading mapping file: {e}", "ERROR") + log_message("Error loading mapping file: {}".format(e), "ERROR") return None mapping = np.where(mapping == '-', np.nan, mapping) @@ -373,7 +375,7 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): pdb_model_path = os.path.join(pdb_folder, 'model1.pdb') residue_dict = load_residues_from_pdb(pdb_model_path) - log_message(f"Processing PDB files in folder: {pdb_folder}") + log_message("Processing PDB files in folder: {}".format(pdb_folder)) tar_bond_ind = [] processed_files = 0 # To track the number of files successfully processed @@ -382,24 +384,24 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): for i, files in enumerate(os.listdir(pdb_folder)): # Skip any file that has already been processed (files with 'addH_' prefix) if files.startswith('addH_'): - log_message(f"Skipping already fixed file: {files}", "INFO") + log_message("Skipping already fixed file: {}".format(files), "INFO") continue - log_message(f"Processing file {i+1}: {files}") - + log_message("Processing file {}: {}".format(i + 1, files)) + pdb_file_path = os.path.join(pdb_folder, files) fixed_pdb_path = pdb_file_path.replace(files, 'addH_' + files) # Check if the fixed file already exists, skip fixing if it does if not os.path.exists(fixed_pdb_path): try: - log_message(f"Running fixer on {pdb_file_path} using {fixer}.") + log_message("Running fixer on {} using {}.".format(pdb_file_path, fixer)) addMissingAtoms(pdb_file_path, method=fixer) except Exception as e: - log_message(f"Error adding missing atoms: {e}", "ERROR") + log_message("Error adding missing atoms: {}".format(e), "ERROR") continue else: - log_message(f"Using existing fixed file: {fixed_pdb_path}") + log_message("Using existing fixed file: {}".format(fixed_pdb_path)) try: coords = parsePDB(fixed_pdb_path) @@ -407,12 +409,12 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): interactions = Interactions() bonds = interaction_func(atoms) except Exception as e: - log_message(f"Error processing PDB file {files}: {e}", "ERROR") + log_message("Error processing PDB file {}: {}".format(files, e), "ERROR") continue # If no bonds were found, skip this file if len(bonds) == 0: - log_message(f"No {bond_type} found in file {files}, skipping.", "WARNING") + log_message("No {} found in file {}, skipping.".format(bond_type, files), "WARNING") continue processed_files += 1 # Increment successfully processed files @@ -449,13 +451,13 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): if index.size != 0: count[j] += 1 else: - log_message(f"Skipping file {files} due to index out of bounds error", "WARNING") + log_message("Skipping file {} due to index out of bounds error".format(files), "WARNING") else: - log_message(f"No matching indices found for {pairs} in {files}", "WARNING") + log_message("No matching indices found for {} in {}".format(pairs, files), "WARNING") # If no files were successfully processed or no bonds were found if processed_files == 0 or len(tar_bond_ind) == 0: - log_message(f"No valid {bond_type} entries found across all PDB files.", "ERROR") + log_message("No valid {} entries found across all PDB files.".format(bond_type), "ERROR") return None count_reshaped = count.reshape(-1, 1) @@ -473,8 +475,8 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): # Combine tar_bond_with_aa with count and percentage output_data = np.hstack((tar_bond_with_aa, count_reshaped, count_normalized)) - log_message(f"Finished processing {processed_files} PDB files.") - output_filename = f'{bond_type}_consensus.txt' + log_message("Finished processing {} PDB files.".format(processed_files)) + output_filename = '{}_consensus.txt'.format(bond_type) # Save the result with amino acid codes and numeric values np.savetxt(output_filename, output_data, fmt='%s %s %s %s', delimiter=' ', @@ -493,7 +495,7 @@ def plot_barh(result, bond_type, n_per_plot=None, min_height=8): # Error handling if result is None or empty if result is None or len(result) == 0: - log_message(f"Skipping plot for {bond_type} due to insufficient data.", "ERROR") + log_message("Skipping plot for {} due to insufficient data.".format(bond_type), "ERROR") return num_entries = result.shape[0] @@ -505,10 +507,9 @@ def plot_barh(result, bond_type, n_per_plot=None, min_height=8): end_idx = min((plot_idx + 1) * n_per_plot, num_entries) result_chunk = result[start_idx:end_idx] - log_message(f"Plotting entries {start_idx+1} to {end_idx} for {bond_type}.") - + log_message("Plotting entries {} to {} for {}.".format(start_idx + 1, end_idx, bond_type)) # Use residue numbers for y-axis labels - y_labels = [f"{str(row[0])}-{str(row[1])}" for row in result_chunk] + y_labels = ["{}-{}".format(str(row[0]), str(row[1])) for row in result_chunk] percentage_values = result_chunk[:, 3].astype('float') norm = plt.Normalize(vmin=0, vmax=100) @@ -526,16 +527,15 @@ def plot_barh(result, bond_type, n_per_plot=None, min_height=8): plt.colorbar(sm, label='Percentage', fraction=0.02, pad=0.04) plt.ylim(-1, result_chunk.shape[0]) - plt.ylabel(f'{bond_type} Pairs of Residue Numbers') + plt.ylabel('{} Pairs of Residue Numbers'.format(bond_type)) plt.xlabel('Percentage') - plt.title(f'Persistence of {bond_type} Bonds (entries {start_idx+1}-{end_idx})') - + plt.title('Persistence of {} Bonds (entries {}-{})'.format(bond_type, start_idx + 1, end_idx)) # Save each plot with an incremented filename for multiple plots - output_plot_file = f'{bond_type}_plot_part{plot_idx + 1}.png' - log_message(f"Saving plot to: {output_plot_file}") + output_plot_file = '{}_plot_part{}.png'.format(bond_type, plot_idx + 1) + log_message("Saving plot to: {}".format(output_plot_file)) plt.savefig(output_plot_file) plt.close() - log_message(f"Plot saved successfully.") + log_message("Plot saved successfully.") def calcHydrophobicOverlapingAreas(atoms, **kwargs): @@ -3253,7 +3253,6 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): :type fixer: 'pdbfixer' or 'openbabel' """ - import os functions = { "HydrogenBonds": calcHydrogenBonds, "SaltBridges": calcSaltBridges, @@ -3267,18 +3266,17 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): # Process each bond type for bond_type, func in functions.items(): # Check if the consensus file already exists - #consensus_file = f'{bond_type}_consensus.txt' - #if os.path.exists(consensus_file): - # log_message(f"Consensus file for {bond_type} already exists, skipping.", "INFO") - # continue + consensus_file = '{}_consensus.txt'.format(bond_type) + if os.path.exists(consensus_file): + log_message("Consensus file for {} already exists, skipping.".format(bond_type), "INFO") + continue - log_message(f"Processing {bond_type}") - + log_message("Processing {}".format(bond_type)) result = process_data(mapping_file, PDB_folder, func, bond_type, fixer) # Check if the result is None (no valid bonds found) if result is None: - log_message(f"No valid {bond_type} entries found, skipping further processing.", "WARNING") + log_message("No valid {} entries found, skipping further processing.".format(bond_type), "WARNING") continue result, fixed_files = result @@ -3292,7 +3290,7 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): for fixed_file in fixed_files: if os.path.exists(fixed_file): os.remove(fixed_file) - log_message(f"Removed fixed file: {fixed_file}") + log_message("Removed fixed file: {}".format(fixed_file)) From 0b8da0fa956fe0d0c206c41b721a9779e1d2bae3 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 09:59:03 +0100 Subject: [PATCH 06/30] fixer import --- prody/proteins/interactions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 7fd66f66d..1787022f2 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -34,7 +34,7 @@ import multiprocessing import matplotlib.pylab as plt import os -from . import fixer +from .fixer import * __all__ = ['calcHydrogenBonds', 'calcChHydrogenBonds', 'calcSaltBridges', @@ -3280,8 +3280,8 @@ def extractMultiModelPDB(multimodelPDB, folder_name): def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): - """Analyzes protein structures to identify various interactions using InSty. - Processes data from the MSA file and folder with selected models. + """Analyzes protein structures to identify various interactions using InSty. + Processes data from the MSA file and folder with selected models. :arg mapping_file: Aligned residue indices, MSA file type :type mapping_file: str From 1260011205a3a7695dc16978ad7432e819383c0d Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 17:13:28 +0100 Subject: [PATCH 07/30] extractMultiModelPDB improvements --- prody/proteins/interactions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 1787022f2..428e04be2 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3239,7 +3239,7 @@ def showSminaTermValues(data): return show -def extractMultiModelPDB(multimodelPDB, folder_name): +def extractMultiModelPDB(multimodelPDB, **kwargs): """Extracts individual PDB models from multimodel PDB and places them into the pointed directory. If used for calculating calcSignatureInteractions align the models. From f2956d96bcc5a099f4a80e646e19de91ffcb7c89 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 17:44:15 +0100 Subject: [PATCH 08/30] createFoldseekAlignment --- prody/proteins/interactions.py | 70 +++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 428e04be2..1e7c4a106 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -51,7 +51,8 @@ 'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory', 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', - 'saveInteractionsAsDummyAtoms', 'extractMultiModelPDB', 'calcSignatureInteractions'] + 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', + 'extractMultiModelPDB', 'calcSignatureInteractions'] def cleanNumbers(listContacts): @@ -3239,6 +3240,73 @@ def showSminaTermValues(data): return show +def createFoldseekAlignment(prot_seq, prot_foldseek, *kwargs): + """Aligns sequences from prot_seq with homologous sequences identified in prot_foldseek, + generating a multiple sequence alignment. + + :arg prot_seq: The natural sequence extracted from the PDB (seq file) + :type prot_seq: str + + :arg prot_foldseek: The results from foldseek (foldseek file) + :type prot_foldseek: str + + :arg msa_output_name: The natural sequence extracted from the PDB (msa file) + :type msa_output_name: str + """ + + msa_output_name = kwargs.pop('msa_output_name', 'prot_struc.msa') + + def find_match_index(tar_nogap, nat_seq): + tar_nogap_str = ''.join(tar_nogap) + nat_seq_str = ''.join(nat_seq) + index = nat_seq_str.find(tar_nogap_str) + return index + + # Read input files + with open(prot_seq, 'r') as f: + file1 = f.readlines() + + with open(prot_foldseek, 'r') as f: + file2 = f.readlines() + + # Open output file + with open(msa_output_name, 'w') as fp: + nat_seq = list(file1[0].strip()) + + # Write the natural sequence to the output file + fp.write(''.join(nat_seq) + "\n") + + # Process each foldseek entry + for line in file2: + entries = line.split() + + if float(entries[2]) >= 0.5: + tar_seq = list(entries[11].strip()) + mat_seq = list(entries[12].strip()) + + tar_nogap = [] + processed_mat = [] + + for j in range(len(tar_seq)): + if tar_seq[j] != '-': + tar_nogap.append(tar_seq[j]) + processed_mat.append(mat_seq[j]) + + match_index = find_match_index(tar_nogap, nat_seq) + end_index = match_index + len(tar_nogap) + m = 0 + + for l in range(len(nat_seq)): + if l < match_index: + fp.write("-") + elif l >= match_index and l < end_index: + fp.write(processed_mat[m]) + m += 1 + else: + fp.write("-") + fp.write("\n") + + def extractMultiModelPDB(multimodelPDB, **kwargs): """Extracts individual PDB models from multimodel PDB and places them into the pointed directory. If used for calculating calcSignatureInteractions align the models. From ca717195fd747d34d44d8a751a60795f63fdef30 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 19:02:17 +0100 Subject: [PATCH 09/30] fix in kwargs, createFoldseekAlignment --- prody/proteins/interactions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 1e7c4a106..6df421e9a 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3240,7 +3240,7 @@ def showSminaTermValues(data): return show -def createFoldseekAlignment(prot_seq, prot_foldseek, *kwargs): +def createFoldseekAlignment(prot_seq, prot_foldseek, **kwargs): """Aligns sequences from prot_seq with homologous sequences identified in prot_foldseek, generating a multiple sequence alignment. From f736b8a43f143bc9972fb7af7fc9f22c49542d81 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 21:05:11 +0100 Subject: [PATCH 10/30] LOGGER info added to SignInt functions --- prody/proteins/interactions.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 6df421e9a..c94f359c5 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -324,7 +324,7 @@ def remove_empty_strings(row): def log_message(message, level="INFO"): """Log a message with a specified log level.""" - print("[{}] {}".format(level, message)) + LOGGER.info("[{}] {}".format(level, message)) def is_module_installed(module_name): """Check if a Python module is installed.""" @@ -3305,6 +3305,8 @@ def find_match_index(tar_nogap, nat_seq): else: fp.write("-") fp.write("\n") + + LOGGER.info("MSA file is now created, and saved as {}.".format(msa_output_name)) def extractMultiModelPDB(multimodelPDB, **kwargs): @@ -3317,7 +3319,8 @@ def extractMultiModelPDB(multimodelPDB, **kwargs): :arg folder_name: The name of the folder to which PDBs will be extracted :type folder_name: str """ - + import os + folder_name = kwargs.pop('folder_name', 'struc_homologs') with open(multimodelPDB, 'r') as f: @@ -3345,6 +3348,8 @@ def extractMultiModelPDB(multimodelPDB, **kwargs): if sig3 == 'ATOM' and fp: fp.write("{}\n".format(line)) + + LOGGER.info("Individual models are saved in {}.".format(folder_name)) def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): @@ -3360,6 +3365,7 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): :arg fixer: The method for fixing lack of hydrogen bonds :type fixer: 'pdbfixer' or 'openbabel' """ + import os functions = { "HydrogenBonds": calcHydrogenBonds, From 01126525b9a6050a03a19df11e43df94228bae72 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 22:42:24 +0100 Subject: [PATCH 11/30] runFoldseek is added --- prody/proteins/interactions.py | 296 ++++++++++++++++++++++++++++++++- 1 file changed, 295 insertions(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index c94f359c5..fd433508a 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -51,7 +51,7 @@ 'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory', 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', - 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', + 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', 'runFoldseek', 'extractMultiModelPDB', 'calcSignatureInteractions'] @@ -3352,6 +3352,300 @@ def extractMultiModelPDB(multimodelPDB, **kwargs): LOGGER.info("Individual models are saved in {}.".format(folder_name)) +def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold): + """This script processes a PDB file to extract a specified chain's sequence, searches for + homologous structures using foldseek, and prepares alignment outputs for further analysis. + Before using the function, install Foldseek: + >>> conda install bioconda::foldseek + + :arg pdb_file: A PDB file path + :type pdb_file: str + + :arg chain: Chain identifier + :type chain: str + + :arg coverage_threshold: Coverage threshold + :type coverage_threshold: int, float + + :arg tm_threshold: TM-score threshold + :type tm_threshold: float + """ + + import os + import subprocess + import re + import sys + + # Define the amino acid conversion function + def aa_onelet(three_letter_code): + codes = { + 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', + 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', + 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', + 'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V', + 'MSE': 'M' + } + return codes.get(three_letter_code) + + # Function to extract the sequence from the PDB file + def extract_sequence_from_pdb(pdb_file, chain, output_file): + sequence = [] + prev_resid = -9999 + + with open(pdb_file, 'r') as pdb: + for line in pdb: + if line.startswith("ATOM"): + ch = line[21] + resid = int(line[22:26].strip()) + aa = line[17:20].strip() + + if ch == chain and resid != prev_resid: + one_aa = aa_onelet(aa) + if one_aa: + sequence.append(one_aa) + prev_resid = resid + + with open(output_file, 'w') as seq_file: + seq_file.write(''.join(sequence)) + + # Inputs + fpath = pdb_file.strip() + chain = chain.strip() + cov_threshold = float(coverage_threshold.strip()) + tm_threshold = float(tm_threshold.strip()) + + # Filter PDB file + awk_command = "awk '{{if (substr($0, 22, 1) == \"{0}\") print}}'".format(chain) + subprocess.run("cat {0} | grep '^ATOM' | {1} > inp.pdb".format(fpath, awk_command), shell=True) + + # Run foldseek and other commands + subprocess.run([ + 'foldseek', 'easy-search', 'inp.pdb', '../../../foldseek/pdb', 'prot.foldseek', + 'tmp2', '--exhaustive-search', '1', '--format-output', + "query,target,qstart,qend,tstart,tend,qcov,tcov,qtmscore,ttmscore,rmsd,qaln,taln", + '-c', str(cov_threshold) + ]) + + # Extract sequence and write to prot.seq + extract_sequence_from_pdb('inp.pdb', chain, 'prot.seq') + createFoldseekAlignment('prot.seq', 'prot.foldseek', msa_output_name='prot_struc.msa') + + # Read input files + with open('inp.pdb', 'r') as f: + file1 = f.readlines() + + with open('prot.foldseek', 'r') as f: + file2 = f.readlines() + + with open('prot_struc.msa', 'r') as f: + file3 = f.readlines() + + # Open output files + fp1 = open("aligned_structures.pdb", 'w') + fp2 = open("shortlisted_resind.msa", 'w') + fp6 = open("seq_match_reconfirm.txt", 'w') + fp7 = open("aligned_structures_extended.pdb", 'w') + fp9 = open("shortlisted.foldseek", 'w') + fp10 = open("shortlisted.msa", 'w') + + # Initialize variables + mod_count = 1 + fp1.write("MODEL\t{0}\n".format(mod_count)) + fp7.write("MODEL\t{0}\n".format(mod_count)) + + seq1 = list(file3[0].strip()) + ind1 = 0 + prev_id = -9999 + + # Process input PDB file + for line in file1: + line = line.strip() + id_ = line[0:4] + ch = line[21] + resid = int(line[22:26].strip()) + aa = line[17:20] + + if id_ == 'ATOM' and ch == chain and resid != prev_id: + prev_id = resid + one_aa = aa_onelet(aa) + if one_aa == seq1[ind1]: + fp1.write("{0}\n".format(line)) + fp7.write("{0}\n".format(line)) + fp2.write("{0} ".format(resid)) + ind1 += 1 + else: + print("Mismatch in sequence and structure of Query protein at Index {0}".format(ind1)) + break + elif id_ == 'ATOM' and ch == chain and resid == prev_id: + fp1.write("{0}\n".format(line)) + fp7.write("{0}\n".format(line)) + + fp1.write("TER\nENDMDL\n\n") + fp7.write("TER\nENDMDL\n\n") + fp2.write("\n") + fp10.write("{0}\n".format(file3[0].strip())) + + # Processing foldseek results + os.makedirs("temp", exist_ok=True) + + for i, entry in enumerate(file2): + entries = re.split(r'\s+', entry.strip()) + + if float(entries[8]) < tm_threshold: + continue + + tstart = int(entries[4]) + tend = int(entries[5]) + pdb = entries[1][:4] + chain = entries[1][-1] + fname = "{0}.pdb".format(pdb) + + # Download and process the target PDB file + subprocess.run(['wget', '-P', 'temp', "https://files.rcsb.org/download/{0}".format(fname)]) + + awk_command = "awk '{{if (substr($0, 22, 1) == \"{0}\") print}}'".format(chain) + subprocess.run("cat ./temp/{0} | grep -E '^(ATOM|HETATM)' | {1} > target.pdb".format(fname, awk_command), shell=True) + + with open('target.pdb', 'r') as target_file: + file4 = target_file.readlines() + + qseq = list(entries[11]) + tseq = list(entries[12]) + + start_line = 0 + prevind = -9999 + tarind = 0 + + for j, line in enumerate(file4): + resid = int(line[22:26].strip()) + if resid != prevind: + prevind = resid + tarind += 1 + if tarind == tstart: + start_line = j + break + + prevind = -9999 + ind2 = 0 + j = start_line + flag2 = False + + with open("temp_coord.txt", 'w') as fp4, \ + open("temp_resind.txt", 'w') as fp3, \ + open("temp_seq.txt", 'w') as fp5: + + for k in range(len(qseq)): + if tseq[k] != '-' and qseq[k] != '-': + line = file4[j].strip() + resid = int(line[22:26].strip()) + aa = line[17:20] + one_aa = aa_onelet(aa) + + if one_aa == tseq[k]: + fp3.write("{0} ".format(resid)) + fp5.write("{0}".format(one_aa)) + prevind = resid + else: + print("Mismatch in sequence and structure of Target protein {0}{1} at line {2} Index {3}-{4} ne {5}".format( + pdb, chain, j, k, one_aa, tseq[k])) + flag2 = True + break + + while resid == prevind: + fp4.write("{0}\n".format(line)) + j += 1 + if j >= len(file4): + break + line = file4[j].strip() + resid = int(line[22:26].strip()) + elif tseq[k] != '-' and qseq[k] == '-': + line = file4[j].strip() + resid = int(line[22:26].strip()) + aa = line[17:20] + one_aa = aa_onelet(aa) + + if one_aa == tseq[k]: + prevind = resid + else: + print("Mismatch in sequence and structure of Target protein {0}{1} at Line {2} Index {3}-{4} ne {5}".format( + pdb, chain, j, k, one_aa, tseq[k])) + flag2 = True + break + + while resid == prevind: + j += 1 + if j >= len(file4): + break + line = file4[j].strip() + resid = int(line[22:26].strip()) + + elif tseq[k] == '-' and qseq[k] != '-': + fp3.write("- ") + fp5.write("-") + + if flag2: + continue + + with open("temp_coord.txt", 'r') as f: + tmpcord = f.readlines() + + with open("temp_resind.txt", 'r') as f: + tmpresind = f.readlines() + + with open("temp_seq.txt", 'r') as f: + tmpseq = f.readlines() + + ind3 = i + 1 + seq1 = list(file3[ind3].strip()) + seq2 = tmpresind[0].strip().split() + + fp9.write("{0}".format(file2[i])) + fp10.write("{0}\n".format(file3[ind3].strip())) + + for m in range(len(seq1)): + if seq1[m] == '-': + fp2.write("{0} ".format(seq1[m])) + else: + break + + for n in range(len(seq2)): + fp2.write("{0} ".format(seq2[n])) + fp6.write("{0}".format(seq1[m])) + m += 1 + + for o in range(m, len(seq1)): + fp2.write("{0} ".format(seq1[m])) + + fp2.write("\n") + fp6.write("\n{0}\n\n\n".format(tmpseq[0])) + + mod_count += 1 + fp1.write("MODEL\t{0}\n".format(mod_count)) + fp7.write("MODEL\t{0}\n".format(mod_count)) + + for line in file4: + if line.strip(): + fp7.write("{0}\n".format(line.strip())) + + for line in tmpcord: + fp1.write("{0}".format(line)) + + fp1.write("TER\nENDMDL\n\n") + fp7.write("TER\nENDMDL\n\n") + + # Cleanup + fp1.close() + fp2.close() + fp6.close() + fp7.close() + fp9.close() + fp10.close() + + extractMultiModelPDB('aligned_structures.pdb', folder_name='struc_homologs') + subprocess.run("rm -f inp.pdb prot.seq target.pdb temp_coord.txt temp_resind.txt temp_seq.txt", shell=True) + subprocess.run("rm -rf tmp2 temp", shell=True) + + def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): """Analyzes protein structures to identify various interactions using InSty. Processes data from the MSA file and folder with selected models. From f0565f2ed6bfcbff0ceb9f9186df085ad6f3ef56 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Mon, 11 Nov 2024 19:22:03 +0100 Subject: [PATCH 12/30] LOGGER changes --- prody/proteins/interactions.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index fd433508a..44c1dd82c 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3352,7 +3352,7 @@ def extractMultiModelPDB(multimodelPDB, **kwargs): LOGGER.info("Individual models are saved in {}.".format(folder_name)) -def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold): +def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold, **kwargs): """This script processes a PDB file to extract a specified chain's sequence, searches for homologous structures using foldseek, and prepares alignment outputs for further analysis. Before using the function, install Foldseek: @@ -3365,10 +3365,13 @@ def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold): :type chain: str :arg coverage_threshold: Coverage threshold - :type coverage_threshold: int, float + :type coverage_threshold: float :arg tm_threshold: TM-score threshold :type tm_threshold: float + + :arg database_folder: Folder with the database + :type database_folder: str """ import os @@ -3376,6 +3379,8 @@ def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold): import re import sys + database_folder = kwargs.pop('database_folder', '../../../foldseek/pdb') + # Define the amino acid conversion function def aa_onelet(three_letter_code): codes = { @@ -3413,6 +3418,8 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): chain = chain.strip() cov_threshold = float(coverage_threshold.strip()) tm_threshold = float(tm_threshold.strip()) + + print(cov_threshold, type(cov_threshold)) # Filter PDB file awk_command = "awk '{{if (substr($0, 22, 1) == \"{0}\") print}}'".format(chain) @@ -3420,12 +3427,12 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): # Run foldseek and other commands subprocess.run([ - 'foldseek', 'easy-search', 'inp.pdb', '../../../foldseek/pdb', 'prot.foldseek', + 'foldseek', 'easy-search', 'inp.pdb', database_folder, 'prot.foldseek', 'tmp2', '--exhaustive-search', '1', '--format-output', "query,target,qstart,qend,tstart,tend,qcov,tcov,qtmscore,ttmscore,rmsd,qaln,taln", '-c', str(cov_threshold) ]) - + # Extract sequence and write to prot.seq extract_sequence_from_pdb('inp.pdb', chain, 'prot.seq') createFoldseekAlignment('prot.seq', 'prot.foldseek', msa_output_name='prot_struc.msa') @@ -3644,7 +3651,7 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): extractMultiModelPDB('aligned_structures.pdb', folder_name='struc_homologs') subprocess.run("rm -f inp.pdb prot.seq target.pdb temp_coord.txt temp_resind.txt temp_seq.txt", shell=True) subprocess.run("rm -rf tmp2 temp", shell=True) - + def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): """Analyzes protein structures to identify various interactions using InSty. From ba342f928234037c974012194aa2bc1afb479494 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Mon, 11 Nov 2024 19:27:47 +0100 Subject: [PATCH 13/30] cov-mode parameter to foldseek run --- prody/proteins/interactions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 44c1dd82c..df6891306 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3430,7 +3430,7 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): 'foldseek', 'easy-search', 'inp.pdb', database_folder, 'prot.foldseek', 'tmp2', '--exhaustive-search', '1', '--format-output', "query,target,qstart,qend,tstart,tend,qcov,tcov,qtmscore,ttmscore,rmsd,qaln,taln", - '-c', str(cov_threshold) + '-c', str(cov_threshold), '--cov-mode', '0' ]) # Extract sequence and write to prot.seq From 05b4621ab74d0f518e43fb08f283ae8a9c2ffac5 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Mon, 11 Nov 2024 22:05:08 +0100 Subject: [PATCH 14/30] runFoldseek improvements with kwargs, docs --- prody/proteins/interactions.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index df6891306..d4a28930b 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3352,11 +3352,16 @@ def extractMultiModelPDB(multimodelPDB, **kwargs): LOGGER.info("Individual models are saved in {}.".format(folder_name)) -def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold, **kwargs): +def runFoldseek(pdb_file, chain, **kwargs): """This script processes a PDB file to extract a specified chain's sequence, searches for homologous structures using foldseek, and prepares alignment outputs for further analysis. + Before using the function, install Foldseek: >>> conda install bioconda::foldseek + More information can be found: + https://github.com/steineggerlab/foldseek?tab=readme-ov-file#databasesand + + This function will not work under Windows. :arg pdb_file: A PDB file path :type pdb_file: str @@ -3365,12 +3370,16 @@ def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold, **kwargs): :type chain: str :arg coverage_threshold: Coverage threshold + by default 0.3 :type coverage_threshold: float :arg tm_threshold: TM-score threshold + by default 0.5 :type tm_threshold: float :arg database_folder: Folder with the database + by default '~/Donwload/foldseek/pdb' + To download the database use: 'foldseek databases PDB pdb tmp' in the console :type database_folder: str """ @@ -3379,8 +3388,10 @@ def runFoldseek(pdb_file, chain, coverage_threshold, tm_threshold, **kwargs): import re import sys - database_folder = kwargs.pop('database_folder', '../../../foldseek/pdb') - + database_folder = kwargs.pop('database_folder', '~/Donwload/foldseek/pdb') + coverage_threshold = kwargs.pop('coverage_threshold', 0.3) + tm_threshold = kwargs.pop('tm_threshold', 0.5) + # Define the amino acid conversion function def aa_onelet(three_letter_code): codes = { @@ -3416,11 +3427,9 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): # Inputs fpath = pdb_file.strip() chain = chain.strip() - cov_threshold = float(coverage_threshold.strip()) - tm_threshold = float(tm_threshold.strip()) + cov_threshold = float(coverage_threshold) + tm_threshold = float(tm_threshold) - print(cov_threshold, type(cov_threshold)) - # Filter PDB file awk_command = "awk '{{if (substr($0, 22, 1) == \"{0}\") print}}'".format(chain) subprocess.run("cat {0} | grep '^ATOM' | {1} > inp.pdb".format(fpath, awk_command), shell=True) From d31d183d8ab532f258f919df6560c50eeff17ffe Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Mon, 11 Nov 2024 22:31:44 +0100 Subject: [PATCH 15/30] changes in imports --- prody/proteins/interactions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index d4a28930b..ee2b55f9a 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -32,8 +32,6 @@ from prody.ensemble import Ensemble import multiprocessing -import matplotlib.pylab as plt -import os from .fixer import * @@ -314,10 +312,6 @@ def showPairEnergy(data, **kwargs): # SignatureInteractions supporting functions -from Bio.PDB.Polypeptide import three_to_one -import shutil -import importlib.util - def remove_empty_strings(row): """Remove empty strings from a list.""" return [elem for elem in row if elem != ''] @@ -328,15 +322,18 @@ def log_message(message, level="INFO"): def is_module_installed(module_name): """Check if a Python module is installed.""" + import importlib.util spec = importlib.util.find_spec(module_name) return spec is not None def is_command_installed(command): """Check if a command-line tool is installed.""" + import shutil return shutil.which(command) is not None def load_residues_from_pdb(pdb_file): """Extract residue numbers and their corresponding one-letter amino acid codes from a PDB file.""" + from Bio.PDB.Polypeptide import three_to_one structure = parsePDB(pdb_file) residues = structure.iterResidues() residue_dict = {} @@ -373,6 +370,7 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): mapping_num = filtered_mapping.astype(float) # Load the one-letter amino acid codes from model1.pdb + import os pdb_model_path = os.path.join(pdb_folder, 'model1.pdb') residue_dict = load_residues_from_pdb(pdb_model_path) @@ -488,6 +486,7 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): def plot_barh(result, bond_type, n_per_plot=None, min_height=8): """Plot horizontal bar plots of percentages, splitting the data into fixed-sized plots.""" + import matplotlib.pylab as plt plt.rcParams.update({'font.size': 20}) # Set default value for n_per_plot if None is passed @@ -3675,6 +3674,7 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): :arg fixer: The method for fixing lack of hydrogen bonds :type fixer: 'pdbfixer' or 'openbabel' """ + import os functions = { From 9b79bd7b458ab1a0368efe967a4aba29936d3412 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Tue, 12 Nov 2024 21:36:51 +0100 Subject: [PATCH 16/30] kwargs for plot_barh --- prody/proteins/interactions.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index ee2b55f9a..b81a35c1e 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -484,12 +484,24 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): return output_data, fixed_files # Return fixed_files to remove later -def plot_barh(result, bond_type, n_per_plot=None, min_height=8): - """Plot horizontal bar plots of percentages, splitting the data into fixed-sized plots.""" +def plot_barh(result, bond_type, **kwargs): + """Plot horizontal bar plots of percentages of interactions, splitting the data into fixed-sized plots. + + :arg n_per_plot: The number of results per one plot + by default is 20 if set to None + :type n_per_plot: int + + :arg min_height: Size of the bar plot + by default is 8 + :type min_height: int + """ + import matplotlib.pylab as plt plt.rcParams.update({'font.size': 20}) - # Set default value for n_per_plot if None is passed + n_per_plot = kwargs.pop('n_per_plot', None) + min_height = kwargs.pop('min_height', 8) + if n_per_plot is None: n_per_plot = 20 From 6e342c44c40c30282e3ec28098539fbd57432c7b Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Tue, 12 Nov 2024 21:45:33 +0100 Subject: [PATCH 17/30] fixing typos --- prody/proteins/interactions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index b81a35c1e..c00da0121 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3330,6 +3330,7 @@ def extractMultiModelPDB(multimodelPDB, **kwargs): :arg folder_name: The name of the folder to which PDBs will be extracted :type folder_name: str """ + import os folder_name = kwargs.pop('folder_name', 'struc_homologs') @@ -3389,7 +3390,7 @@ def runFoldseek(pdb_file, chain, **kwargs): :type tm_threshold: float :arg database_folder: Folder with the database - by default '~/Donwload/foldseek/pdb' + by default '~/Downloads/foldseek/pdb' To download the database use: 'foldseek databases PDB pdb tmp' in the console :type database_folder: str """ @@ -3399,7 +3400,7 @@ def runFoldseek(pdb_file, chain, **kwargs): import re import sys - database_folder = kwargs.pop('database_folder', '~/Donwload/foldseek/pdb') + database_folder = kwargs.pop('database_folder', '~/Downloads/foldseek/pdb') coverage_threshold = kwargs.pop('coverage_threshold', 0.3) tm_threshold = kwargs.pop('tm_threshold', 0.5) From 5450f8caa2ff14a62a1bf18184aecba8c185f53d Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Tue, 12 Nov 2024 23:25:09 +0100 Subject: [PATCH 18/30] kwargs for calcSignatureInteractions, optional removal of tmp files --- prody/proteins/interactions.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index c00da0121..fd6124a3d 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3674,7 +3674,7 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): subprocess.run("rm -rf tmp2 temp", shell=True) -def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): +def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): """Analyzes protein structures to identify various interactions using InSty. Processes data from the MSA file and folder with selected models. @@ -3685,11 +3685,21 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): :type PDB_folder: str :arg fixer: The method for fixing lack of hydrogen bonds + by default is 'pdbfixer' :type fixer: 'pdbfixer' or 'openbabel' + + :arg remove_tmp_files: Removing intermediate files that are created in the process + by default is True + :type remove_tmp_files: True or False """ import os + fixer = kwargs.pop('fixer', 'pdbfixer') + remove_tmp_files = kwargs.pop('remove_tmp_files', True) + n_per_plot = kwargs.pop('n_per_plot', None) + min_height = kwargs.pop('min_height', 8) + functions = { "HydrogenBonds": calcHydrogenBonds, "SaltBridges": calcSaltBridges, @@ -3719,15 +3729,15 @@ def calcSignatureInteractions(mapping_file, PDB_folder, fixer='pdbfixer'): result, fixed_files = result # Proceed with plotting - n = None - plot_barh(result, bond_type, n) + plot_barh(result, bond_type, n_per_plot=n_per_plot, min_height=min_height) # Remove all fixed files at the end - if 'fixed_files' in locals(): - for fixed_file in fixed_files: - if os.path.exists(fixed_file): - os.remove(fixed_file) - log_message("Removed fixed file: {}".format(fixed_file)) + if remove_tmp_files == True: + if 'fixed_files' in locals(): + for fixed_file in fixed_files: + if os.path.exists(fixed_file): + os.remove(fixed_file) + log_message("Removed fixed file: {}".format(fixed_file)) From 1ec47c5993d05c2be42e3977a1ea1f8143a35539 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Wed, 13 Nov 2024 13:41:13 +0100 Subject: [PATCH 19/30] if for 'fixed_files' removed --- prody/proteins/interactions.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index fd6124a3d..c80140689 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3733,11 +3733,10 @@ def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): # Remove all fixed files at the end if remove_tmp_files == True: - if 'fixed_files' in locals(): - for fixed_file in fixed_files: - if os.path.exists(fixed_file): - os.remove(fixed_file) - log_message("Removed fixed file: {}".format(fixed_file)) + for fixed_file in fixed_files: + if os.path.exists(fixed_file): + os.remove(fixed_file) + log_message("Removed fixed file: {}".format(fixed_file)) From 6d348784b93d482231a3dd43662ab2f549429672 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Wed, 13 Nov 2024 14:35:53 +0100 Subject: [PATCH 20/30] 3to1 letter code is not replaced by AAMAP --- prody/proteins/interactions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index c80140689..e5d2050b8 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -333,7 +333,7 @@ def is_command_installed(command): def load_residues_from_pdb(pdb_file): """Extract residue numbers and their corresponding one-letter amino acid codes from a PDB file.""" - from Bio.PDB.Polypeptide import three_to_one + from prody.atomic.atomic import AAMAP structure = parsePDB(pdb_file) residues = structure.iterResidues() residue_dict = {} @@ -342,7 +342,7 @@ def load_residues_from_pdb(pdb_file): resnum = res.getResnum() resname = res.getResname() # Three-letter amino acid code try: - one_letter_code = three_to_one(resname) # Convert to one-letter code + one_letter_code = AAMAP[resname] residue_dict[resnum] = one_letter_code except KeyError: log_message("Unknown residue: {} at position {}".format(resname, resnum), "WARNING") From ee64b51fb957f0f3109cf6aacf63869a6f979c2d Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Wed, 13 Nov 2024 14:59:56 +0100 Subject: [PATCH 21/30] Checks for database_folder exists --- prody/proteins/interactions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index e5d2050b8..9cc43962a 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3404,6 +3404,10 @@ def runFoldseek(pdb_file, chain, **kwargs): coverage_threshold = kwargs.pop('coverage_threshold', 0.3) tm_threshold = kwargs.pop('tm_threshold', 0.5) + full_path = os.path.expanduser(database_folder) + if not os.path.exists(full_path.strip('pdb')): + raise ValueError('The required database is not found in {0}. Please download it first.'.format(database_folder.strip('pdb'))) + # Define the amino acid conversion function def aa_onelet(three_letter_code): codes = { From 8c19c0fbb067d8ed96625e9f56adacdd58b65c94 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Wed, 13 Nov 2024 15:13:34 +0100 Subject: [PATCH 22/30] additional checks and docs improvements --- prody/proteins/interactions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 9cc43962a..629898632 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3374,6 +3374,9 @@ def runFoldseek(pdb_file, chain, **kwargs): https://github.com/steineggerlab/foldseek?tab=readme-ov-file#databasesand This function will not work under Windows. + Example usage: runFoldseek('5kqm.pdb', 'A', database_folder='~/Downloads/foldseek/pdb') + where previous a folder called 'foldseek' were created and PDB database was uploaded using: + >>> foldseek databases PDB pdb tmp (Linux console) :arg pdb_file: A PDB file path :type pdb_file: str @@ -3404,6 +3407,9 @@ def runFoldseek(pdb_file, chain, **kwargs): coverage_threshold = kwargs.pop('coverage_threshold', 0.3) tm_threshold = kwargs.pop('tm_threshold', 0.5) + if not isinstance(pdb_file, str): + raise TypeError('Please provide the name of the PDB file.') + full_path = os.path.expanduser(database_folder) if not os.path.exists(full_path.strip('pdb')): raise ValueError('The required database is not found in {0}. Please download it first.'.format(database_folder.strip('pdb'))) @@ -3682,6 +3688,8 @@ def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): """Analyzes protein structures to identify various interactions using InSty. Processes data from the MSA file and folder with selected models. + Example usage: calcSignatureInteractions('shortlisted_resind.msa','./struc_homologs') + :arg mapping_file: Aligned residue indices, MSA file type :type mapping_file: str From 22ff338198a530dff27720475421ad96628b26a7 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sat, 16 Nov 2024 13:55:17 +0100 Subject: [PATCH 23/30] runDali added --- prody/proteins/interactions.py | 101 ++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 629898632..f799f3398 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -33,6 +33,8 @@ import multiprocessing from .fixer import * +from .compare import * +from prody.measure import calcTransformation, calcDistance, calcRMSD, superpose __all__ = ['calcHydrogenBonds', 'calcChHydrogenBonds', 'calcSaltBridges', @@ -49,7 +51,7 @@ 'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory', 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', - 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', 'runFoldseek', + 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', 'runFoldseek', 'runDali', 'extractMultiModelPDB', 'calcSignatureInteractions'] @@ -3683,6 +3685,103 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): subprocess.run("rm -f inp.pdb prot.seq target.pdb temp_coord.txt temp_resind.txt temp_seq.txt", shell=True) subprocess.run("rm -rf tmp2 temp", shell=True) + +def runDali(pdb, chain, **kwargs): + """This function calls searchDali() and downloads all the PDB files, separate by chains, add hydrogens and missing side chains, + and finally align them and put into the newly created folder. + + :arg pdb: A PDB code + :type pdb: str + + :arg chain: chain identifier + :type chain: str + + :arg cutoff_len: Length of aligned residues < cutoff_len (must be an integer or a float between 0 and 1) + See searchDali for more details + by default 0.5 + :type cutoff_len: float + + :arg cutoff_rmsd: RMSD cutoff (see searchDali) + by default 1.0 + :type cutoff_rmsd: float + + :arg subsetDali: fullPDB, PDB25, PDB50, PDB90 + by default is 'fullPDB' + :type subsetDali: str + + :arg fixer: The method for fixing lack of hydrogen bonds + by default is 'pdbfixer' + :type fixer: 'pdbfixer' or 'openbabel' + + :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) + by default is 'ca' + :type subset: str + + :arg seqid: Minimum value of the sequence identity (see matchChains()) + by default 5 + :type seqid: float + + :arg overlap: percent overlap (see matchChains()) + by default 50 + :type overlap: float + + :arg folder_name: Folder where the results will be collected + by default is 'struc_homologs' + :type folder_name: str + """ + + import os + import shutil + from prody.database import dali + + cutoff_len = kwargs.pop('cutoff_len', 0.5) + cutoff_rmsd = kwargs.pop('cutoff_rmsd', 1.0) + fixer = kwargs.pop('fixer', 'pdbfixer') + subset_Dali = kwargs.pop('subset_Dali', 'fullPDB') + subset = kwargs.pop('subset', 'ca') + seqid = kwargs.pop('seqid', 5) + overlap = kwargs.pop('overlap', 50) + folder_name = kwargs.pop('folder_name', 'struc_homologs') + + dali_rec = dali.searchDali(pdb, chain, subset=subset_Dali) + + while not dali_rec.isSuccess: + dali_rec.fetch() + + pdb_ids = dali_rec.filter(cutoff_len=cutoff_len, cutoff_rmsd=cutoff_rmsd) + pdb_hits = [ (i[:4], i[4:]) for i in pdb_ids ] + + list_pdbs = [] + LOGGER.info('Separating chains and saving into PDB file') + for i in pdb_hits: + LOGGER.info('PDB code {} and chain {}'.format(i[0], i[1])) + p = parsePDB(i[0]).select('chain '+i[1]+' and protein') + writePDB(i[0]+i[1]+'.pdb', p) + list_pdbs.append(i[0]+i[1]+'.pdb') + + #LOGGER.info('Number of selected structures: ', len(list_pdbs)) + LOGGER.info('Adding hydrogens to the structures..') + new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True) + + os.makedirs(folder_name) + structures = parsePDB(new_pdbids) + target = structures[0] + rmsds = [] + for mobile in structures[1:]: + try: + LOGGER.info('Aligning the structures..') + i = mobile.getTitle() + LOGGER.info(i) + matches = matchChains(mobile.protein, target.protein, subset=subset, seqid=seqid, overlap=overlap) + m = matches[0] + m0_alg, T = superpose(m[0], m[1], weights=m[0].getFlags("mapped")) + rmsds.append(calcRMSD(m[0], m[1], weights=m[0].getFlags("mapped"))) + source_file = 'align__'+i+'.pdb' + writePDB(source_file, mobile) + shutil.move(source_file, os.path.join(folder_name, os.path.basename(source_file))) + except TypeError: + raise TypeError('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) + def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): """Analyzes protein structures to identify various interactions using InSty. From 8a0019d491529c9fc1381cb0baa0f4e03da1fee1 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sat, 16 Nov 2024 18:22:43 +0100 Subject: [PATCH 24/30] TypeError replaced by LOGGER.warn in runDali --- prody/proteins/interactions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 7ef226354..afb053622 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3781,8 +3781,8 @@ def runDali(pdb, chain, **kwargs): source_file = 'align__'+i+'.pdb' writePDB(source_file, mobile) shutil.move(source_file, os.path.join(folder_name, os.path.basename(source_file))) - except TypeError: - raise TypeError('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) + except: + LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): From f57418e1f14c3529f5c6905f9c2faf3cd87b52e2 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Tue, 19 Nov 2024 08:31:29 +0100 Subject: [PATCH 25/30] runBLAST form SignInter added --- prody/proteins/interactions.py | 87 +++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index afb053622..6ae348761 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -52,7 +52,7 @@ 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', 'runFoldseek', 'runDali', - 'extractMultiModelPDB', 'calcSignatureInteractions'] + 'runBLAST', 'extractMultiModelPDB', 'calcSignatureInteractions'] def cleanNumbers(listContacts): @@ -3783,7 +3783,92 @@ def runDali(pdb, chain, **kwargs): shutil.move(source_file, os.path.join(folder_name, os.path.basename(source_file))) except: LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) + + +def runBLAST(pdb, chain, **kwargs): + """This function calls blastPDB to find homologs and downloads all of them in PDB format to the local directory, + separate chains that were identified by BLAST, add hydrogens and missing side chains, + and finally align them and put into the newly created folder. + + :arg pdb: A PDB code + :type pdb: str + + :arg chain: chain identifier + :type chain: str + + :arg fixer: The method for fixing lack of hydrogen bonds + by default is 'pdbfixer' + :type fixer: 'pdbfixer' or 'openbabel' + + :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) + by default is 'bb' + :type subset: str + + :arg seqid: Minimum value of the sequence identity (see matchChains()) + by default 90 + :type seqid: float + + :arg overlap: percent overlap (see matchChains()) + by default 50 + :type overlap: float + + :arg folder_name: Folder where the results will be collected + by default is 'struc_homologs' + :type folder_name: str + """ + + import os + import shutil + from prody.proteins.blastpdb import blastPDB + + fixer = kwargs.pop('fixer', 'pdbfixer') + seqid = kwargs.pop('seqid', 90) + overlap = kwargs.pop('overlap', 50) + subset = kwargs.pop('subset', 'bb') + folder_name = kwargs.pop('folder_name', 'struc_homologs') + + ref_prot = parsePDB(pdb) + ref_hv = ref_prot.getHierView()[chain] + sequence = ref_hv.getSequence() + + blast_record = blastPDB(sequence) + while not blast_record.isSuccess: + blast_record.fetch() + + pdb_hits = [] + for key, item in blast_record.getHits(seqid).items(): + pdb_hits.append((key, item['chain_id'])) + + list_pdbs = [] + LOGGER.info('Separating chains and saving into PDB file') + for i in pdb_hits: + LOGGER.info('PDB code {} and chain {}'.format(i[0], i[1])) + p = parsePDB(i[0]).select('chain '+i[1]+' and protein') + writePDB(i[0]+i[1]+'.pdb', p) + list_pdbs.append(i[0]+i[1]+'.pdb') + LOGGER.info('Adding hydrogens to the structures..') + new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True) + + os.makedirs(folder_name) + structures = parsePDB(new_pdbids) + target = structures[0] + rmsds = [] + for mobile in structures[1:]: + try: + LOGGER.info('Aligning the structures..') + i = mobile.getTitle() + LOGGER.info(i) + matches = matchChains(mobile.protein, target.protein, subset=subset, seqid=seqid, overlap=overlap) + m = matches[0] + m0_alg, T = superpose(m[0], m[1], weights=m[0].getFlags("mapped")) + rmsds.append(calcRMSD(m[0], m[1], weights=m[0].getFlags("mapped"))) + source_file = 'align__'+i+'.pdb' + writePDB(source_file, mobile) + shutil.move(source_file, os.path.join(folder_name, os.path.basename(source_file))) + except: + LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) + def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): """Analyzes protein structures to identify various interactions using InSty. From 50a37d6e39cbc3ee98206456c520e3f76923aefe Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Fri, 22 Nov 2024 22:47:25 +0100 Subject: [PATCH 26/30] calcSignatureInteractions includes now analysis with MSA file [further reorganization needed] --- prody/proteins/interactions.py | 112 ++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 42 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 6ae348761..0e0c74a67 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3870,18 +3870,21 @@ def runBLAST(pdb, chain, **kwargs): LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) -def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): +def calcSignatureInteractions(PDB_folder, **kwargs): """Analyzes protein structures to identify various interactions using InSty. Processes data from the MSA file and folder with selected models. - Example usage: calcSignatureInteractions('shortlisted_resind.msa','./struc_homologs') - - :arg mapping_file: Aligned residue indices, MSA file type - :type mapping_file: str + Example usage: + >>> calcSignatureInteractions('./struc_homologs') + >>> calcSignatureInteractions('./struc_homologs', mapping_file='shortlisted_resind.msa') :arg PDB_folder: Directory containing PDB model files :type PDB_folder: str + :arg mapping_file: Aligned residue indices, MSA file type + required in Foldseek analyisis + :type mapping_file: str + :arg fixer: The method for fixing lack of hydrogen bonds by default is 'pdbfixer' :type fixer: 'pdbfixer' or 'openbabel' @@ -3892,50 +3895,75 @@ def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs): """ import os - - fixer = kwargs.pop('fixer', 'pdbfixer') - remove_tmp_files = kwargs.pop('remove_tmp_files', True) - n_per_plot = kwargs.pop('n_per_plot', None) - min_height = kwargs.pop('min_height', 8) - - functions = { - "HydrogenBonds": calcHydrogenBonds, - "SaltBridges": calcSaltBridges, - "RepulsiveIonicBonding": calcRepulsiveIonicBonding, - "PiStacking": calcPiStacking, - "PiCation": calcPiCation, - "Hydrophobic": calcHydrophobic, - "DisulfideBonds": calcDisulfideBonds - } + mapping_file = kwargs.get('mapping_file') + + if not mapping_file: + # Dali and Blast approach + align_files = [os.path.join(PDB_folder, file) for file in os.listdir(PDB_folder)] + + functions = { + "HBs": calcHydrogenBonds, + "SBs": calcSaltBridges, + "RIB": calcRepulsiveIonicBonding, + "PiStack": calcPiStacking, + "PiCat": calcPiCation, + "HPh": calcHydrophobic, + "DiBs": calcDisulfideBonds + } - # Process each bond type - for bond_type, func in functions.items(): - # Check if the consensus file already exists - consensus_file = '{}_consensus.txt'.format(bond_type) - if os.path.exists(consensus_file): - log_message("Consensus file for {} already exists, skipping.".format(bond_type), "INFO") - continue + for bond_type, func in functions.items(): + for file in align_files: + try: + atoms = parsePDB(file) + interactions = func(atoms.select('protein')) + saveInteractionsAsDummyAtoms(atoms, interactions, filename ='INT_'+bond_type+'_'+file) + except: pass + + else: + # Foldseek approach + mapping_file = kwargs.pop('mapping_file', 'shortlisted_resind.msa') + fixer = kwargs.pop('fixer', 'pdbfixer') + remove_tmp_files = kwargs.pop('remove_tmp_files', True) + n_per_plot = kwargs.pop('n_per_plot', None) + min_height = kwargs.pop('min_height', 8) + + functions = { + "HydrogenBonds": calcHydrogenBonds, + "SaltBridges": calcSaltBridges, + "RepulsiveIonicBonding": calcRepulsiveIonicBonding, + "PiStacking": calcPiStacking, + "PiCation": calcPiCation, + "Hydrophobic": calcHydrophobic, + "DisulfideBonds": calcDisulfideBonds + } - log_message("Processing {}".format(bond_type)) - result = process_data(mapping_file, PDB_folder, func, bond_type, fixer) + # Process each bond type + for bond_type, func in functions.items(): + # Check if the consensus file already exists + consensus_file = '{}_consensus.txt'.format(bond_type) + if os.path.exists(consensus_file): + log_message("Consensus file for {} already exists, skipping.".format(bond_type), "INFO") + continue - # Check if the result is None (no valid bonds found) - if result is None: - log_message("No valid {} entries found, skipping further processing.".format(bond_type), "WARNING") - continue + log_message("Processing {}".format(bond_type)) + result = process_data(mapping_file, PDB_folder, func, bond_type, fixer) - result, fixed_files = result + # Check if the result is None (no valid bonds found) + if result is None: + log_message("No valid {} entries found, skipping further processing.".format(bond_type), "WARNING") + continue - # Proceed with plotting - plot_barh(result, bond_type, n_per_plot=n_per_plot, min_height=min_height) + result, fixed_files = result - # Remove all fixed files at the end - if remove_tmp_files == True: - for fixed_file in fixed_files: - if os.path.exists(fixed_file): - os.remove(fixed_file) - log_message("Removed fixed file: {}".format(fixed_file)) + # Proceed with plotting + plot_barh(result, bond_type, n_per_plot=n_per_plot, min_height=min_height) + # Remove all fixed files at the end + if remove_tmp_files == True: + for fixed_file in fixed_files: + if os.path.exists(fixed_file): + os.remove(fixed_file) + log_message("Removed fixed file: {}".format(fixed_file)) class Interactions(object): From 566d340bedccc9dc2c4b54343dfc7115391f263d Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Wed, 27 Nov 2024 22:15:33 +0100 Subject: [PATCH 27/30] reoeganization of runFoldseek and calcSignatureInteractions --- prody/proteins/interactions.py | 113 +++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 34 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 0e0c74a67..d0291dcea 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3400,6 +3400,26 @@ def runFoldseek(pdb_file, chain, **kwargs): by default '~/Downloads/foldseek/pdb' To download the database use: 'foldseek databases PDB pdb tmp' in the console :type database_folder: str + + :arg fixer: The method for fixing lack of hydrogen bonds + by default is 'pdbfixer' + :type fixer: 'pdbfixer' or 'openbabel' + + :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) + by default is 'ca' + :type subset: str + + :arg seqid: Minimum value of the sequence identity (see matchChains()) + by default 5 + :type seqid: float + + :arg overlap: percent overlap (see matchChains()) + by default 50 + :type overlap: float + + :arg folder_name: Folder where the results will be collected + by default is 'struc_homologs' + :type folder_name: str """ import os @@ -3410,6 +3430,10 @@ def runFoldseek(pdb_file, chain, **kwargs): database_folder = kwargs.pop('database_folder', '~/Downloads/foldseek/pdb') coverage_threshold = kwargs.pop('coverage_threshold', 0.3) tm_threshold = kwargs.pop('tm_threshold', 0.5) + folder_name = kwargs.pop('folder_name', 'struc_homologs') + subset = kwargs.pop('subset', 'ca') + seqid = kwargs.pop('seqid', 5) + overlap = kwargs.pop('overlap', 50) if not isinstance(pdb_file, str): raise TypeError('Please provide the name of the PDB file.') @@ -3683,9 +3707,34 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): fp9.close() fp10.close() - extractMultiModelPDB('aligned_structures.pdb', folder_name='struc_homologs') + extractMultiModelPDB('aligned_structures.pdb', folder_name=folder_name) subprocess.run("rm -f inp.pdb prot.seq target.pdb temp_coord.txt temp_resind.txt temp_seq.txt", shell=True) subprocess.run("rm -rf tmp2 temp", shell=True) + + # New part + pwd = os.path.abspath(folder_name) + list_pdbs = [pwd+'/'+ff for ff in os.listdir(pwd) if ff.endswith('.pdb')] + list_pdbs.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) # all structures will be aligned on model1.pdb as in the oryginal code + + LOGGER.info('Adding hydrogens to the structures..') + new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True) + + structures = parsePDB(new_pdbids) + target = structures[0] + rmsds = [] + for mobile in structures[1:]: + try: + LOGGER.info('Aligning the structures..') + i = mobile.getTitle() + LOGGER.info(i) + matches = matchChains(mobile.protein, target.protein, subset=subset, seqid=seqid, overlap=overlap) + m = matches[0] + m0_alg, T = superpose(m[0], m[1], weights=m[0].getFlags("mapped")) + rmsds.append(calcRMSD(m[0], m[1], weights=m[0].getFlags("mapped"))) + source_file = pwd+'/'+'align__'+i+'.pdb' + writePDB(source_file, mobile) + except: + LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) def runDali(pdb, chain, **kwargs): @@ -3761,7 +3810,6 @@ def runDali(pdb, chain, **kwargs): writePDB(i[0]+i[1]+'.pdb', p) list_pdbs.append(i[0]+i[1]+'.pdb') - #LOGGER.info('Number of selected structures: ', len(list_pdbs)) LOGGER.info('Adding hydrogens to the structures..') new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True) @@ -3881,6 +3929,10 @@ def calcSignatureInteractions(PDB_folder, **kwargs): :arg PDB_folder: Directory containing PDB model files :type PDB_folder: str + :arg use_prefix: Whether to use perfix to select particular file names in the PDB_folder + Default is True + :type use_prefix: bool + :arg mapping_file: Aligned residue indices, MSA file type required in Foldseek analyisis :type mapping_file: str @@ -3888,42 +3940,42 @@ def calcSignatureInteractions(PDB_folder, **kwargs): :arg fixer: The method for fixing lack of hydrogen bonds by default is 'pdbfixer' :type fixer: 'pdbfixer' or 'openbabel' - - :arg remove_tmp_files: Removing intermediate files that are created in the process - by default is True - :type remove_tmp_files: True or False """ import os mapping_file = kwargs.get('mapping_file') + use_prefix = kwargs.pop('use_prefix', True) - if not mapping_file: - # Dali and Blast approach + if use_prefix == True: + align_files = [os.path.join(PDB_folder, file) for file in os.listdir(PDB_folder) if file.startswith("align_")] + + else: align_files = [os.path.join(PDB_folder, file) for file in os.listdir(PDB_folder)] - functions = { - "HBs": calcHydrogenBonds, - "SBs": calcSaltBridges, - "RIB": calcRepulsiveIonicBonding, - "PiStack": calcPiStacking, - "PiCat": calcPiCation, - "HPh": calcHydrophobic, - "DiBs": calcDisulfideBonds - } + functions_dict = { + "HBs": calcHydrogenBonds, + "SBs": calcSaltBridges, + "RIB": calcRepulsiveIonicBonding, + "PiStack": calcPiStacking, + "PiCat": calcPiCation, + "HPh": calcHydrophobic, + "DiBs": calcDisulfideBonds + } - for bond_type, func in functions.items(): - for file in align_files: - try: - atoms = parsePDB(file) - interactions = func(atoms.select('protein')) - saveInteractionsAsDummyAtoms(atoms, interactions, filename ='INT_'+bond_type+'_'+file) - except: pass + for bond_type, func in functions_dict.items(): + for file in align_files: + try: + LOGGER.info(file) + atoms = parsePDB(file) + interactions = func(atoms.select('protein')) + saveInteractionsAsDummyAtoms(atoms, interactions, filename=file.rstrip(file.split('/')[-1])+'INT_'+bond_type+'_'+file.split('/')[-1]) + except: pass - else: - # Foldseek approach + + if mapping_file: + # for MSA file (Foldseek) mapping_file = kwargs.pop('mapping_file', 'shortlisted_resind.msa') fixer = kwargs.pop('fixer', 'pdbfixer') - remove_tmp_files = kwargs.pop('remove_tmp_files', True) n_per_plot = kwargs.pop('n_per_plot', None) min_height = kwargs.pop('min_height', 8) @@ -3958,13 +4010,6 @@ def calcSignatureInteractions(PDB_folder, **kwargs): # Proceed with plotting plot_barh(result, bond_type, n_per_plot=n_per_plot, min_height=min_height) - # Remove all fixed files at the end - if remove_tmp_files == True: - for fixed_file in fixed_files: - if os.path.exists(fixed_file): - os.remove(fixed_file) - log_message("Removed fixed file: {}".format(fixed_file)) - class Interactions(object): From 12841bbd3a117164610221951b9f949624e648e1 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Thu, 28 Nov 2024 12:21:30 +0100 Subject: [PATCH 28/30] docs fix [InSty] --- prody/proteins/interactions.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index d0291dcea..99fe82bf4 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -3749,35 +3749,35 @@ def runDali(pdb, chain, **kwargs): :arg cutoff_len: Length of aligned residues < cutoff_len (must be an integer or a float between 0 and 1) See searchDali for more details - by default 0.5 + Default is 0.5 :type cutoff_len: float :arg cutoff_rmsd: RMSD cutoff (see searchDali) - by default 1.0 + Default is 1.0 :type cutoff_rmsd: float - :arg subsetDali: fullPDB, PDB25, PDB50, PDB90 - by default is 'fullPDB' - :type subsetDali: str + :arg subset_Dali: fullPDB, PDB25, PDB50, PDB90 + Default is 'fullPDB' + :type subset_Dali: str :arg fixer: The method for fixing lack of hydrogen bonds - by default is 'pdbfixer' + Default is 'pdbfixer' :type fixer: 'pdbfixer' or 'openbabel' :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) - by default is 'ca' + Default is 'ca' :type subset: str :arg seqid: Minimum value of the sequence identity (see matchChains()) - by default 5 + Default 5 :type seqid: float :arg overlap: percent overlap (see matchChains()) - by default 50 + Default 50 :type overlap: float :arg folder_name: Folder where the results will be collected - by default is 'struc_homologs' + Default is 'struc_homologs' :type folder_name: str """ From c7d4a9979c3dc7d6554840c060d5e1ba7174a77b Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 1 Dec 2024 21:07:30 +0100 Subject: [PATCH 29/30] final rearrangement of calcSignatureInteractions with MSA file --- prody/proteins/interactions.py | 76 +++++++++++----------------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 99fe82bf4..b4c749ee9 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -356,7 +356,7 @@ def append_residue_code(residue_num, residue_dict): aa_code = residue_dict.get(residue_num, "X") # Use "X" for unknown residues return "{}{}".format(aa_code, residue_num) -def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): +def process_data(mapping_file, pdb_folder, interaction_func, bond_type, files_for_analysis): """Process the mapping file and the PDB folder to compute interaction counts and percentages.""" log_message("Loading mapping file: {}".format(mapping_file)) @@ -382,33 +382,14 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): processed_files = 0 # To track the number of files successfully processed fixed_files = [] # Store paths of fixed files to remove at the end - for i, files in enumerate(os.listdir(pdb_folder)): - # Skip any file that has already been processed (files with 'addH_' prefix) - if files.startswith('addH_'): - log_message("Skipping already fixed file: {}".format(files), "INFO") - continue - + for i, files in enumerate(files_for_analysis): log_message("Processing file {}: {}".format(i + 1, files)) - pdb_file_path = os.path.join(pdb_folder, files) - fixed_pdb_path = pdb_file_path.replace(files, 'addH_' + files) - - # Check if the fixed file already exists, skip fixing if it does - if not os.path.exists(fixed_pdb_path): - try: - log_message("Running fixer on {} using {}.".format(pdb_file_path, fixer)) - addMissingAtoms(pdb_file_path, method=fixer) - except Exception as e: - log_message("Error adding missing atoms: {}".format(e), "ERROR") - continue - else: - log_message("Using existing fixed file: {}".format(fixed_pdb_path)) - try: - coords = parsePDB(fixed_pdb_path) + coords = parsePDB(files) atoms = coords.select('protein') - interactions = Interactions() bonds = interaction_func(atoms) + saveInteractionsAsDummyAtoms(atoms, bonds, filename=files.rstrip(files.split('/')[-1])+'INT_'+bond_type+'_'+files.split('/')[-1]) except Exception as e: log_message("Error processing PDB file {}: {}".format(files, e), "ERROR") continue @@ -419,7 +400,7 @@ def process_data(mapping_file, pdb_folder, interaction_func, bond_type, fixer): continue processed_files += 1 # Increment successfully processed files - fixed_files.append(fixed_pdb_path) + fixed_files.append(files) if processed_files == 1: # First valid file with bonds determines target indices for entries in bonds: @@ -3572,6 +3553,12 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file): awk_command = "awk '{{if (substr($0, 22, 1) == \"{0}\") print}}'".format(chain) subprocess.run("cat ./temp/{0} | grep -E '^(ATOM|HETATM)' | {1} > target.pdb".format(fname, awk_command), shell=True) + # Check if target.pdb has fewer than 5 lines + if sum(1 for _ in open("target.pdb")) < 5: + LOGGER.info("target.pdb has fewer than 5 lines. Skipping further processing.") + subprocess.run(['rm', 'target.pdb']) + continue + with open('target.pdb', 'r') as target_file: file4 = target_file.readlines() @@ -3936,10 +3923,6 @@ def calcSignatureInteractions(PDB_folder, **kwargs): :arg mapping_file: Aligned residue indices, MSA file type required in Foldseek analyisis :type mapping_file: str - - :arg fixer: The method for fixing lack of hydrogen bonds - by default is 'pdbfixer' - :type fixer: 'pdbfixer' or 'openbabel' """ import os @@ -3962,35 +3945,23 @@ def calcSignatureInteractions(PDB_folder, **kwargs): "DiBs": calcDisulfideBonds } - for bond_type, func in functions_dict.items(): - for file in align_files: - try: - LOGGER.info(file) - atoms = parsePDB(file) - interactions = func(atoms.select('protein')) - saveInteractionsAsDummyAtoms(atoms, interactions, filename=file.rstrip(file.split('/')[-1])+'INT_'+bond_type+'_'+file.split('/')[-1]) - except: pass - - + if not mapping_file: + for bond_type, func in functions_dict.items(): + for file in align_files: + try: + LOGGER.info(file) + atoms = parsePDB(file) + interactions = func(atoms.select('protein')) + saveInteractionsAsDummyAtoms(atoms, interactions, filename=file.rstrip(file.split('/')[-1])+'INT_'+bond_type+'_'+file.split('/')[-1]) + except: pass + if mapping_file: # for MSA file (Foldseek) - mapping_file = kwargs.pop('mapping_file', 'shortlisted_resind.msa') - fixer = kwargs.pop('fixer', 'pdbfixer') n_per_plot = kwargs.pop('n_per_plot', None) min_height = kwargs.pop('min_height', 8) - functions = { - "HydrogenBonds": calcHydrogenBonds, - "SaltBridges": calcSaltBridges, - "RepulsiveIonicBonding": calcRepulsiveIonicBonding, - "PiStacking": calcPiStacking, - "PiCation": calcPiCation, - "Hydrophobic": calcHydrophobic, - "DisulfideBonds": calcDisulfideBonds - } - # Process each bond type - for bond_type, func in functions.items(): + for bond_type, func in functions_dict.items(): # Check if the consensus file already exists consensus_file = '{}_consensus.txt'.format(bond_type) if os.path.exists(consensus_file): @@ -3998,7 +3969,7 @@ def calcSignatureInteractions(PDB_folder, **kwargs): continue log_message("Processing {}".format(bond_type)) - result = process_data(mapping_file, PDB_folder, func, bond_type, fixer) + result = process_data(mapping_file, PDB_folder, func, bond_type, files_for_analysis=align_files) # Check if the result is None (no valid bonds found) if result is None: @@ -4009,6 +3980,7 @@ def calcSignatureInteractions(PDB_folder, **kwargs): # Proceed with plotting plot_barh(result, bond_type, n_per_plot=n_per_plot, min_height=min_height) + class Interactions(object): From c159a34e2edf8eaa362a8f327e11855e62056dcd Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 1 Dec 2024 21:43:47 +0100 Subject: [PATCH 30/30] Changes in docs --- prody/proteins/interactions.py | 100 ++++++++++++++++----------------- 1 file changed, 48 insertions(+), 52 deletions(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index b4c749ee9..0cad12a09 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -471,11 +471,11 @@ def plot_barh(result, bond_type, **kwargs): """Plot horizontal bar plots of percentages of interactions, splitting the data into fixed-sized plots. :arg n_per_plot: The number of results per one plot - by default is 20 if set to None + Default is 20 if set to None :type n_per_plot: int :arg min_height: Size of the bar plot - by default is 8 + Default is 8 :type min_height: int """ @@ -614,15 +614,15 @@ def calcSASA(atoms, **kwargs): :type atoms: :class:`.Atomic` :arg selection: selection string - by default all the protein structure is used + Default all the protein structure is used :type selection: str :arg sasa_cutoff: cutoff for SASA values - default is 0.0 + Default is 0.0 :type sasa_cutoff: float, int :arg resnames: residues name included - default is False + Default is False :type resnames: bool """ @@ -672,19 +672,19 @@ def calcVolume(atoms, **kwargs): :type atoms: :class:`.Atomic` :arg selection: selection string - by default all the protein structure is used + Default all the protein structure is used :type selection: str :arg volume_cutoff: cutoff for volume - default is 0.0 to include all the results + Default is 0.0 to include all the results :type sasa_volume: float, int :arg split_residues: it will provide values for each residue - default is False + Default is False :type split_residues: bool :arg resnames: residues name included - default is False + Default is False True - will give residue names and values for each residue False - will give only the values for each residues :type resnames: bool @@ -2114,35 +2114,35 @@ def showInteractionsGraph(statistics, **kwargs): :type statistics: list :arg cutoff: Minimal number of counts per residue in the trajectory - by default 0.1. + Default is 0.1. :type cutoff: int, float :arg code: representation of the residues, 3-letter or 1-letter - by default 3-letter + Default is 3-letter :type code: str :arg multiple_chains: display chain name for structure with many chains - by default False + Default is False :type multiple_chains: bool :arg edge_cmap: color of the residue connection - by default plt.cm.Blues (blue color) + Default is plt.cm.Blues (blue color) :type edge_cmap: str :arg node_size: size of the nodes which describes residues - by default 300 + Default is 300 :type node_size: int :arg node_distance: value which will scale residue-residue interactions - by default 5 + Default is 5 :type node_distance: int :arg font_size: size of the font - by default 14 + Default is 14 :type font_size: int :arg seed: random number which affect the distribution of residues - by default 42 + Default is 42 :type seed: int """ @@ -2278,11 +2278,11 @@ def showInteractionsHist(statistics, **kwargs): :type statistics: list :arg clip: maxmimal number of residue pairs on the bar plot - by default 20 + Default is 20 :type clip: int :arg font_size: size of the font - by default 18 + Default is 18 :type font_size: int """ @@ -2584,7 +2584,7 @@ def listLigandInteractions(PLIP_output, **kwargs): :arg output: parameter to print the interactions on the screen while analyzing the structure (True | False) - by default is False + Default is False :type output: bool Note that five types of interactions are considered: hydrogen bonds, salt bridges, @@ -2966,23 +2966,23 @@ def calcSminaBindingAffinity(atoms, trajectory=None, **kwargs): :arg protein_selection: selection string for the protein and other compoment of the system that should be included, e.g. "protein and chain A", - by default "protein" + Default is "protein" :type protein_selection: str :arg ligand_selection: selection string for ligand, e.g. "resname ADP", - by default "all not protein" + Default is "all not protein" :type ligand_selection: str :arg ligand_selection: scoring function (vina or vinardo) - by default is "vina" + Default is "vina" :type ligand_selection: str :arg atom_terms: write per-atom interaction term values. It will provide more information as dictionary for each frame/model, and details for atom terms will be saved in terms_*frame_number*.txt, - by default is False + Default is False :type atom_terms: bool @@ -3370,38 +3370,37 @@ def runFoldseek(pdb_file, chain, **kwargs): :type chain: str :arg coverage_threshold: Coverage threshold - by default 0.3 + Default is 0.3 :type coverage_threshold: float :arg tm_threshold: TM-score threshold - by default 0.5 + Default is 0.5 :type tm_threshold: float :arg database_folder: Folder with the database - by default '~/Downloads/foldseek/pdb' + Default is '~/Downloads/foldseek/pdb' To download the database use: 'foldseek databases PDB pdb tmp' in the console :type database_folder: str :arg fixer: The method for fixing lack of hydrogen bonds - by default is 'pdbfixer' + Default is 'pdbfixer' :type fixer: 'pdbfixer' or 'openbabel' :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) - by default is 'ca' + Default is 'ca' :type subset: str :arg seqid: Minimum value of the sequence identity (see matchChains()) - by default 5 + Default is 5 :type seqid: float :arg overlap: percent overlap (see matchChains()) - by default 50 + Default is 50 :type overlap: float :arg folder_name: Folder where the results will be collected - by default is 'struc_homologs' - :type folder_name: str - """ + Default is 'struc_homologs' + :type folder_name: str """ import os import subprocess @@ -3756,17 +3755,16 @@ def runDali(pdb, chain, **kwargs): :type subset: str :arg seqid: Minimum value of the sequence identity (see matchChains()) - Default 5 + Default is 5 :type seqid: float :arg overlap: percent overlap (see matchChains()) - Default 50 + Default is 50 :type overlap: float :arg folder_name: Folder where the results will be collected Default is 'struc_homologs' - :type folder_name: str - """ + :type folder_name: str """ import os import shutil @@ -3832,25 +3830,24 @@ def runBLAST(pdb, chain, **kwargs): :type chain: str :arg fixer: The method for fixing lack of hydrogen bonds - by default is 'pdbfixer' + Default is 'pdbfixer' :type fixer: 'pdbfixer' or 'openbabel' :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) - by default is 'bb' + Default is 'bb' :type subset: str :arg seqid: Minimum value of the sequence identity (see matchChains()) - by default 90 + Default is 90 :type seqid: float :arg overlap: percent overlap (see matchChains()) - by default 50 + Default is 50 :type overlap: float :arg folder_name: Folder where the results will be collected - by default is 'struc_homologs' - :type folder_name: str - """ + Default is 'struc_homologs' + :type folder_name: str """ import os import shutil @@ -3922,8 +3919,7 @@ def calcSignatureInteractions(PDB_folder, **kwargs): :arg mapping_file: Aligned residue indices, MSA file type required in Foldseek analyisis - :type mapping_file: str - """ + :type mapping_file: str """ import os mapping_file = kwargs.get('mapping_file') @@ -4415,7 +4411,7 @@ def buildInteractionMatrixEnergy(self, **kwargs): """Build matrix with interaction energy comming from energy of pairs of specific residues. :arg energy_list_type: name of the list with energies - default is 'IB_solv' + Default is 'IB_solv' :type energy_list_type: 'IB_nosolv', 'IB_solv', 'CS' """ @@ -4578,7 +4574,7 @@ def getFrequentInteractors(self, contacts_min=2): (7) Disulfide bonds (disb) :arg contacts_min: Minimal number of contacts which residue may form with other residues, - by default 2. + Default is 2. :type contacts_min: int """ atoms = self._atoms @@ -5611,7 +5607,7 @@ def getLigandInteractions(self, **kwargs): :arg include_frames: used with filters, it will leave selected keyword in orginal lists, if False it will collect selected interactions in one list, Use True to assign new selection using setLigandInteractions. - by default True + Default is True :type include_frames: bool """ @@ -5661,7 +5657,7 @@ def getLigandInteractionsNumber(self, **kwargs): be a total number of interactions or it can be divided into interaction types. :arg types: Interaction types can be included (True) or not (False). - by default is True. + Default is True. :type types: bool """ @@ -5803,7 +5799,7 @@ def saveInteractionsPDB(self, **kwargs): :type filename: str :arg ligand_sele: ligand selection, - by default is 'all not (protein or water or ion)'. + Default is 'all not (protein or water or ion)'. :type ligand_sele: str """