diff --git a/pensa/__init__.py b/pensa/__init__.py index 8b58c6d..db8aa5b 100644 --- a/pensa/__init__.py +++ b/pensa/__init__.py @@ -2,6 +2,6 @@ from .features import * from .statesinfo import * from .clusters import * -from .comparison import * +from .comparison import * from .dimensionality import * diff --git a/pensa/features/__init__.py b/pensa/features/__init__.py index 3a7feb3..b6f4f27 100644 --- a/pensa/features/__init__.py +++ b/pensa/features/__init__.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# - * - coding: utf-8 - * - """ Methods to read in and process features from coordinates. @@ -13,4 +13,4 @@ from .txt_features import * from .csv_features import * from .hbond_features import * -#from .pyemma_features import * \ No newline at end of file +# from .pyemma_features import * diff --git a/pensa/features/atom_features.py b/pensa/features/atom_features.py index d075cd2..2b64e4f 100644 --- a/pensa/features/atom_features.py +++ b/pensa/features/atom_features.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# - * - coding: utf-8 - * - """ Methods to obtain a timeseries distribution for the atom/ion pockets' occupancies. @@ -19,8 +19,8 @@ from gridData import Grid from tqdm import tqdm import os -from pensa.preprocessing.density import * - +from pensa.preprocessing.density import \ + get_grid, data_out, write_atom_to_pdb, convert_to_occ, local_maxima_3D def get_atom_features(structure_input, xtc_input, atomgroup, element, top_atoms=10, @@ -58,7 +58,6 @@ def get_atom_features(structure_input, xtc_input, atomgroup, element, top_atoms= Data for all features """ - if write is not None: if out_name is None: print('WARNING: You must provide out_name if writing out result.') @@ -95,89 +94,89 @@ def get_atom_features(structure_input, xtc_input, atomgroup, element, top_atoms= else: g = Grid(grid_input) - ##converting the density to a probability + # converting the density to a probability atom_number = len(u.select_atoms('name ' + atomgroup)) - grid_data = np.array(g.grid)*atom_number/np.sum(np.array(g.grid)) + grid_data = np.array(g.grid) * atom_number / np.sum(np.array(g.grid)) - ##mask all probabilities below the average water probability - average_probability_density = atom_number/sum(1 for i in grid_data.flat if i) - ##mask all grid centers with density less than threshold density + # mask all probabilities below the average water probability + average_probability_density = atom_number / sum(1 for i in grid_data.flat if i) + # mask all grid centers with density less than threshold density grid_data[grid_data <= average_probability_density] = 0.0 xyz, val = local_maxima_3D(grid_data) - ##negate the array to get descending order from most prob to least prob - val_sort = np.argsort(-1*val.copy()) + # negate the array to get descending order from most prob to least prob + val_sort = np.argsort(-1 * val.copy()) # values = [val[i] for i in val_sort] coords = [xyz[max_val] for max_val in val_sort] maxdens_coord_str = [str(item)[1:-1] for item in coords] - atom_information=[] - atom_dists=[] - + atom_information = [] + atom_dists = [] if top_atoms > len(coords): top_atoms = len(coords) print('\n') print('Featurizing ', top_atoms, ' Atoms') + for atom_no in range(top_atoms): + print('\n') - print('Atom no: ', atom_no+1) + print('Atom no: ', atom_no + 1) print('\n') - counting=[] - shifted_coords=coords[atom_no]+g.origin + counting = [] + shifted_coords = coords[atom_no] + g.origin point_str = str(shifted_coords)[1:-1] - ## Find all water atoms within 2.5 Angstroms of density maxima + + # Find all water atoms within 2.5 Angstroms of density maxima for i in tqdm(range(len(u.trajectory))): - # for i in tqdm(range(100)): u.trajectory[i] - radius= ' 2.5' - atomgroup_IDS=list(u.select_atoms('name ' + atomgroup + ' and point ' + point_str +radius).indices) - if len(atomgroup_IDS)==0: - atomgroup_IDS=[-1] + radius = ' 2.5' + atomgroup_IDS = list(u.select_atoms('name ' + atomgroup + ' and point ' + point_str + radius).indices) + if len(atomgroup_IDS) == 0: + atomgroup_IDS[-1] counting.append(atomgroup_IDS) - ## Atom indices that appear in the atom site + # Atom indices that appear in the atom site flat_list = [item for sublist in counting for item in sublist] - atom_ID = 'a' + str(atom_no+1) + atom_ID = 'a' + str(atom_no + 1) atom_location = shifted_coords - pocket_occupation_frequency = 1 - flat_list.count(-1)/len(flat_list) + pocket_occupation_frequency = 1 - flat_list.count(-1) / len(flat_list) pocket_occupation_frequency = round(pocket_occupation_frequency, 4) atom_information.append([atom_ID, list(atom_location), pocket_occupation_frequency]) atom_dists.append(counting) - ## Write data out and visualize atom sites in pdb + # Write data out and visualize atom sites in pdb if write is True: data_out('atom_features/' + out_name + atom_ID + '.txt', [counting]) - data_out('atom_features/'+out_name+element+'AtomsSummary.txt', atom_information) + data_out('atom_features/' + out_name + element + 'AtomsSummary.txt', atom_information) write_atom_to_pdb(pdb_outname, atom_location, atom_ID, atomgroup) u_pdb = mda.Universe(pdb_outname) u_pdb.add_TopologyAttr('tempfactors') # Write values as beta-factors ("tempfactors") to a PDB file for res in range(len(atom_information)): - atom_resid = len(u_pdb.residues) - atom_no-1 + res + atom_resid = len(u_pdb.residues) - atom_no - 1 + res u_pdb.residues[atom_resid].atoms.tempfactors = atom_information[res][-1] u_pdb.atoms.write(pdb_outname) # Add atom pocket atomIDs - feature_names[element+'Pocket_Idx']= [atinfo[0] for atinfo in atom_information] - features_data[element+'Pocket_Idx']= np.array(atom_dists, dtype=object) + feature_names[element + 'Pocket_Idx'] = [atinfo[0] for atinfo in atom_information] + features_data[element + 'Pocket_Idx'] = np.array(atom_dists, dtype=object) # Add atom pocket frequencies - feature_names[element+'Pocket_Occup']= [atinfo[0] for atinfo in atom_information] - features_data[element+'Pocket_Occup']= np.array([atinfo[2] for atinfo in atom_information], dtype=object) + feature_names[element + 'Pocket_Occup'] = [atinfo[0] for atinfo in atom_information] + features_data[element + 'Pocket_Occup'] = np.array([atinfo[2] for atinfo in atom_information], dtype=object) # Add atom pocket occupancy timeseries - feature_names[element+'Pocket_OccupDistr']= [atinfo[0] for atinfo in atom_information] - features_data[element+'Pocket_OccupDistr']= np.array([convert_to_occ(distr, -1, water=False) for distr in atom_dists], dtype=object) + feature_names[element + 'Pocket_OccupDistr'] = [atinfo[0] for atinfo in atom_information] + features_data[element + 'Pocket_OccupDistr'] = np.array([convert_to_occ(distr, -1, water=False) for distr in atom_dists], dtype=object) # Add atom pocket locations - feature_names[element+'Pocket_xyz']= [atinfo[0] for atinfo in atom_information] - features_data[element+'Pocket_xyz']= np.array([atinfo[1] for atinfo in atom_information], dtype=object) + feature_names[element + 'Pocket_xyz'] = [atinfo[0] for atinfo in atom_information] + features_data[element + 'Pocket_xyz'] = np.array([atinfo[1] for atinfo in atom_information], dtype=object) # Return the dictionaries. return feature_names, features_data - diff --git a/pensa/features/csv_features.py b/pensa/features/csv_features.py index 85b6d12..0df9fb6 100644 --- a/pensa/features/csv_features.py +++ b/pensa/features/csv_features.py @@ -68,4 +68,3 @@ def get_drormd_features(csv_file): for i, f in enumerate(feature_names): feature_data[:, i] = df[f] return feature_names, feature_data - diff --git a/pensa/features/hbond_features.py b/pensa/features/hbond_features.py index 7d07020..3650052 100644 --- a/pensa/features/hbond_features.py +++ b/pensa/features/hbond_features.py @@ -1,21 +1,22 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- +# - * - coding: utf-8 - * - """ Created on Fri May 6 12:03:52 2022 @author: neil """ import MDAnalysis as mda -from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import ( - HydrogenBondAnalysis as HBA) +from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import (HydrogenBondAnalysis as HBA) import numpy as np from gridData import Grid from tqdm import tqdm import os -from pensa.preprocessing.density import * +from pensa.preprocessing.density import get_grid, local_maxima_3D, write_atom_to_pdb + def Unique_bonding_pairs(lst): - return ([list(i) for i in {*[tuple(sorted(i)) for i in lst]}]) + return ([list(i) for i in {* [tuple(sorted(i)) for i in lst]}]) + def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, grid_input=None, write=None, write_grid_as=None, out_name=None): @@ -79,28 +80,27 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, u.trajectory.rewind() if grid_input is None: - g = get_grid(u, atomgroups[0], write_grid_as, out_name) + g = get_grid(u, atomgroups[0], write_grid_as, out_name) else: g = Grid(grid_input) elif grid_input is None: - g = get_grid(u, atomgroup) + g = get_grid(u, atomgroups) else: g = Grid(grid_input) xyz, val = local_maxima_3D(g.grid) - ## Negate the array to get probabilities in descending order - val_sort = np.argsort(-1*val.copy()) + # Negate the array to get probabilities in descending order + val_sort = np.argsort(-1 * val.copy()) coords = [xyz[max_val] for max_val in val_sort] maxdens_coord_str = [str(item)[1:-1] for item in coords] - site_information=[] - O_hbonds_all_site=[] - H_hbonds_all_site=[] - + site_information = [] + O_hbonds_all_site = [] + H_hbonds_all_site = [] for site_no in site_IDs: - feature_names['W'+str(site_no)]={} - features_data['W'+str(site_no)]={} + feature_names['W' + str(site_no)] = {} + features_data['W' + str(site_no)] = {} ID_to_idx = site_no - 1 print('\n') @@ -108,94 +108,93 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, print('\n') O_hbonds = [] H_hbonds = [] - ## Find all water atoms within 3.5 Angstroms of density maxima + # Find all water atoms within 3.5 Angstroms of density maxima # Shifting the coordinates of the maxima by the grid origin to match # the simulation box coordinates - shifted_coords=coords[ID_to_idx]+g.origin + shifted_coords = coords[ID_to_idx] + g.origin point_str = str(shifted_coords)[1:-1] - counting=[] + counting = [] if write is True: - write_atom_to_pdb(pdb_outname, shifted_coords, 'W'+str(site_no), atomgroups[0]) + write_atom_to_pdb(pdb_outname, shifted_coords, 'W' + str(site_no), atomgroups[0]) for frame_no in tqdm(range(len(u.trajectory))): - # for frame_no in tqdm(range(100)): u.trajectory[frame_no] radius = ' 3.5' atomgroup_IDS = list(u.select_atoms('byres (name ' + atomgroups[0] + ' and point ' + point_str + radius + ')').indices)[::3] counting.append(list(set(atomgroup_IDS))) - ## Water atom indices that appear in the water site + # Water atom indices that appear in the water site flat_list = [item for sublist in counting for item in sublist] - ## Extract water orientation timeseries + + # Extract water orientation timeseries for frame_no in tqdm(range(len(u.trajectory))): - # for frame_no in tqdm(range(100)): u.trajectory[frame_no] - site_resid=counting[frame_no] + site_resid = counting[frame_no] # print(site_resid) - if len(site_resid)==1: - ## (x, y, z) positions for the water oxygen at trajectory frame_no + + if len(site_resid) == 1: + # (x, y, z) positions for the water oxygen at trajectory frame_no proxprotatg = 'protein and around 3.5 byres index ' + str(site_resid[0]) O_site = 'index ' + str(site_resid[0]) - H_site = '((byres index ' + str(site_resid[0]) + ') and (name '+atomgroups[1]+' or name '+atomgroups[2]+'))' + H_site = '((byres index ' + str(site_resid[0]) + ') and (name ' + atomgroups[1] + ' or name ' + atomgroups[2] + '))' hbond = HBA(universe=u) protein_hydrogens_sel = hbond.guess_hydrogens(proxprotatg) protein_acceptors_sel = hbond.guess_acceptors(proxprotatg) - if len(protein_hydrogens_sel)!=0: # bonds formed by the water oxygen - O_bonds = '( '+protein_hydrogens_sel+' ) and around 3.5 '+O_site + if len(protein_hydrogens_sel) != 0: + O_bonds = '( ' + protein_hydrogens_sel + ' ) and around 3.5 ' + O_site else: O_bonds = '' # bonds formed by the water hydrogens - if len(protein_acceptors_sel)!=0: - H_bonds = '( '+ protein_acceptors_sel+' ) and around 3.5 '+H_site + if len(protein_acceptors_sel) != 0: + H_bonds = '( ' + protein_acceptors_sel + ' ) and around 3.5 ' + H_site else: - H_bonds='' + H_bonds = '' H_hbond_group = u.select_atoms(H_bonds) H_hbonds.append(H_hbond_group) O_hbond_group = u.select_atoms(O_bonds) O_hbonds.append(O_hbond_group) - ## Featurize water with highest pocket occupation (if multiple waters in pocket) - elif len(site_resid)>1: - freq_count=[] + # Featurize water with highest pocket occupation (if multiple waters in pocket) + elif len(site_resid) > 1: + freq_count = [] for ID in site_resid: freq_count.append([flat_list.count(ID), ID]) - freq_count.sort(key = lambda x: x[0]) + freq_count.sort(key=lambda x: x[0]) proxprotatg = 'protein and around 3.5 byres index ' + str(freq_count[-1][1]) O_site = 'index ' + str(freq_count[-1][1]) - H_site = '((byres index ' + str(freq_count[-1][1]) + ') and (name '+atomgroups[1]+' or name '+atomgroups[2]+'))' + H_site = '((byres index ' + str(freq_count[-1][1]) + ') and (name ' + atomgroups[1] + ' or name ' + atomgroups[2] + '))' hbond = HBA(universe=u) protein_hydrogens_sel = hbond.guess_hydrogens(proxprotatg) protein_acceptors_sel = hbond.guess_acceptors(proxprotatg) - if len(protein_hydrogens_sel)!=0: # bonds formed by the water oxygen - O_bonds = '( '+protein_hydrogens_sel+' ) and around 3.5 '+O_site + if len(protein_hydrogens_sel) != 0: + O_bonds = '( ' + protein_hydrogens_sel + ' ) and around 3.5 ' + O_site else: O_bonds = '' # bonds formed by the water hydrogens - if len(protein_acceptors_sel)!=0: - H_bonds = '( '+ protein_acceptors_sel+' ) and around 3.5 '+H_site + if len(protein_acceptors_sel) != 0: + H_bonds = '( ' + protein_acceptors_sel + ' ) and around 3.5 ' + H_site else: - H_bonds='' + H_bonds = '' H_hbond_group = u.select_atoms(H_bonds) H_hbonds.append(H_hbond_group) O_hbond_group = u.select_atoms(O_bonds) O_hbonds.append(O_hbond_group) - ## 10000.0 = no waters bound - elif len(site_resid)<1: + # 10000.0 = no waters bound + elif len(site_resid) < 1: O_hbonds.append("unocc") H_hbonds.append("unocc") - - bondouts=[] + bondouts = [] for bondtype in [O_hbonds, H_hbonds]: resids = [] for line in bondtype: @@ -203,21 +202,21 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, resids.append([line]) else: idxes = [8, 10, 2] - all_atgs=[] + all_atgs = [] for atg in range(len(line)): # print(line[atg]) stringdex = [str(line[atg]).split(' ')[idx] for idx in idxes] - all_atgs.append(stringdex[0][:-1]+" "+stringdex[1]+" "+stringdex[2]) + all_atgs.append(stringdex[0][:-1] + " " + stringdex[1] + " " + stringdex[2]) resids.append(all_atgs) names = list(set([flat for sub in resids for flat in sub])) - if names.count('unocc')>0: + if names.count('unocc') > 0: names.remove('unocc') dist = np.zeros((len(names), len(u.trajectory))) for bondsite in range(len(names)): for frame in range(len(resids)): - if resids[frame].count(names[bondsite])>0: - dist[bondsite][frame]=1 + if resids[frame].count(names[bondsite]) > 0: + dist[bondsite][frame] = 1 bondouts.append([names, dist]) O_site_pdb_id = "O" + str(site_no) @@ -226,20 +225,20 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, # "FIX OUTPUT UNIFORMITY, SINGLE BONDS NOT OUTPUT WITH ANY ARRAY DIMENSION" if write is True: np.savetxt('h2o_hbonds/' + out_name + O_site_pdb_id + '_names.txt', np.array(bondouts[0][0], dtype=object), fmt='%s') - np.savetxt('h2o_hbonds/' + out_name + O_site_pdb_id + '_data.txt', np.array(bondouts[0][1], dtype=object), fmt='%s') + np.savetxt('h2o_hbonds/' + out_name + O_site_pdb_id + '_data.txt', np.array(bondouts[0][1], dtype=object), fmt='%s') np.savetxt('h2o_hbonds/' + out_name + H_site_pdb_id + '_names.txt', np.array(bondouts[1][0], dtype=object), fmt='%s') - np.savetxt('h2o_hbonds/' + out_name + H_site_pdb_id + '_data.txt', np.array(bondouts[1][1], dtype=object), fmt='%s') + np.savetxt('h2o_hbonds/' + out_name + H_site_pdb_id + '_data.txt', np.array(bondouts[1][1], dtype=object), fmt='%s') - - feature_names['W'+str(site_no)]['acceptor_names'] = np.array(bondouts[0][0], dtype=object) - feature_names['W'+str(site_no)]['donor_names'] = np.array(bondouts[1][0], dtype=object) - features_data['W'+str(site_no)]['acceptor_timeseries'] = np.array(bondouts[0][1], dtype=object) - features_data['W'+str(site_no)]['donor_timeseries'] = np.array(bondouts[1][1], dtype=object) - features_data['W'+str(site_no)]['acceptor_frequencies'] = np.sum(np.array(bondouts[0][1], dtype=object), axis=1)/len(u.trajectory) - features_data['W'+str(site_no)]['donor_frequencies'] = np.sum(np.array(bondouts[1][1], dtype=object), axis=1)/len(u.trajectory) + feature_names['W' + str(site_no)]['acceptor_names'] = np.array(bondouts[0][0], dtype=object) + feature_names['W' + str(site_no)]['donor_names'] = np.array(bondouts[1][0], dtype=object) + features_data['W' + str(site_no)]['acceptor_timeseries'] = np.array(bondouts[0][1], dtype=object) + features_data['W' + str(site_no)]['donor_timeseries'] = np.array(bondouts[1][1], dtype=object) + features_data['W' + str(site_no)]['acceptor_frequencies'] = np.sum(np.array(bondouts[0][1], dtype=object), axis=1) / len(u.trajectory) + features_data['W' + str(site_no)]['donor_frequencies'] = np.sum(np.array(bondouts[1][1], dtype=object), axis=1) / len(u.trajectory) return feature_names, features_data + def get_h_bonds(structure_input, xtc_input, fixed_group, dyn_group, write=None, out_name=None): """ Find hydrogen bonding partners for atomgroup1 in atomgroup2. @@ -308,19 +307,19 @@ def get_h_bonds(structure_input, xtc_input, fixed_group, dyn_group, write=None, # bonds for [[atomgroup1 donors] , [atomgroup1 acceptors]] all_bonds = [[], []] for frame_no in tqdm(range(len(u.trajectory))): - # for frame_no in tqdm(range(100)): + # find the frame u.trajectory[frame_no] # obtain indices for all donor and acceptor atoms frame_bonds = [] for donor1_idx in donor1_idcs: idx_bonds = [] # find donor positions - donor1_pos = np.array(u.select_atoms("index "+str(donor1_idx)).positions) + donor1_pos = np.array(u.select_atoms("index " + str(donor1_idx)).positions) for acceptor2_idx in acceptor2_idcs: # find acceptor positions - acceptor2_pos = np.array(u.select_atoms("index "+str(acceptor2_idx)).positions) + acceptor2_pos = np.array(u.select_atoms("index " + str(acceptor2_idx)).positions) # if distance between atoms less than 3.5 angstrom then count as bond - if np.linalg.norm(donor1_pos-acceptor2_pos) < 3.5: + if np.linalg.norm(donor1_pos - acceptor2_pos) < 3.5: idx_bonds.append([donor1_idx, acceptor2_idx]) # print(idx_bonds) frame_bonds.append(idx_bonds) @@ -331,12 +330,12 @@ def get_h_bonds(structure_input, xtc_input, fixed_group, dyn_group, write=None, for donor2_idx in donor2_idcs: idx_bonds = [] # find donor positions - donor2_pos = np.array(u.select_atoms("index "+str(donor2_idx)).positions) + donor2_pos = np.array(u.select_atoms("index " + str(donor2_idx)).positions) for acceptor1_idx in acceptor1_idcs: # find acceptor positions - acceptor1_pos = np.array(u.select_atoms("index "+str(acceptor1_idx)).positions) + acceptor1_pos = np.array(u.select_atoms("index " + str(acceptor1_idx)).positions) # if distance between atoms less than 3.5 angstrom then count as bond - if np.linalg.norm(donor2_pos-acceptor1_pos) < 3.5: + if np.linalg.norm(donor2_pos - acceptor1_pos) < 3.5: idx_bonds.append([donor2_idx, acceptor1_idx]) # print(idx_bonds) frame_bonds.append(idx_bonds) @@ -346,28 +345,26 @@ def get_h_bonds(structure_input, xtc_input, fixed_group, dyn_group, write=None, all_donor_pairs = Unique_bonding_pairs([y for subl in [x for sub in all_bonds[0] for x in sub] for y in subl]) all_acceptor_pairs = Unique_bonding_pairs([y for subl in [x for sub in all_bonds[1] for x in sub] for y in subl]) - all_donor_pair_names = [[atg_to_names(u.select_atoms('index '+str(i[0])))[0], atg_to_names(u.select_atoms('index '+str(i[1])))[0]] for i in all_donor_pairs] - all_acceptor_pair_names = [[atg_to_names(u.select_atoms('index '+str(i[0])))[0], atg_to_names(u.select_atoms('index '+str(i[1])))[0]] for i in all_acceptor_pairs] + all_donor_pair_names = [[atg_to_names(u.select_atoms('index ' + str(i[0])))[0], atg_to_names(u.select_atoms('index ' + str(i[1])))[0]] for i in all_donor_pairs] + all_acceptor_pair_names = [[atg_to_names(u.select_atoms('index ' + str(i[0])))[0], atg_to_names(u.select_atoms('index ' + str(i[1])))[0]] for i in all_acceptor_pairs] - donor_dist =np.zeros((len(all_donor_pairs), len(u.trajectory))) + donor_dist = np.zeros((len(all_donor_pairs), len(u.trajectory))) acceptor_dist = np.zeros((len(all_acceptor_pairs), len(u.trajectory))) for frame in tqdm(range(len(u.trajectory))): - # for frame_no in tqdm(range(100)): for pair in range(len(all_donor_pairs)): if list(reversed(all_donor_pairs[pair])) in [flat for sub in all_bonds[0][frame_no] for flat in sub]: - donor_dist[pair][frame_no]=1 + donor_dist[pair][frame_no] = 1 for pair in range(len(all_acceptor_pairs)): if list(reversed(all_acceptor_pairs[pair])) in [flat for sub in all_bonds[1][frame_no] for flat in sub]: - acceptor_dist[pair][frame_no]=1 - + acceptor_dist[pair][frame_no] = 1 # Write data out and visualize water sites in pdb if write is True: - np.savetxt('lig_hbonds/'+out_name+'all_donor_pair_names.txt', np.array(all_donor_pair_names, dtype=object), fmt='%s') - np.savetxt('lig_hbonds/'+out_name+'all_acceptor_pair_names.txt', np.array(all_acceptor_pair_names, dtype=object), fmt='%s') - np.savetxt('lig_hbonds/'+out_name+'all_donor_pair_data.txt', np.array(donor_dist, dtype=object), fmt='%s') - np.savetxt('lig_hbonds/'+out_name+'all_acceptor_pair_data.txt', np.array(acceptor_dist, dtype=object), fmt='%s') + np.savetxt('lig_hbonds/' + out_name + 'all_donor_pair_names.txt', np.array(all_donor_pair_names, dtype=object), fmt='%s') + np.savetxt('lig_hbonds/' + out_name + 'all_acceptor_pair_names.txt', np.array(all_acceptor_pair_names, dtype=object), fmt='%s') + np.savetxt('lig_hbonds/' + out_name + 'all_donor_pair_data.txt', np.array(donor_dist, dtype=object), fmt='%s') + np.savetxt('lig_hbonds/' + out_name + 'all_acceptor_pair_data.txt', np.array(acceptor_dist, dtype=object), fmt='%s') feature_names['donor_names'] = np.array(all_donor_pair_names) feature_names['acceptor_names'] = np.array(all_acceptor_pair_names) @@ -376,12 +373,12 @@ def get_h_bonds(structure_input, xtc_input, fixed_group, dyn_group, write=None, return feature_names, features_data + def atg_to_names(atg): idxes = [8, 10, 2] - all_atgs=[] + all_atgs = [] print(atg) for line in range(len(atg)): stringdex = [str(atg[line]).split(' ')[idx] for idx in idxes] - all_atgs.append(stringdex[0][:-1]+" "+stringdex[1]+" "+stringdex[2]) + all_atgs.append(stringdex[0][:-1] + " " + stringdex[1] + " " + stringdex[2]) return all_atgs - diff --git a/pensa/features/mda_combined.py b/pensa/features/mda_combined.py index 6123778..1dfb255 100644 --- a/pensa/features/mda_combined.py +++ b/pensa/features/mda_combined.py @@ -1,4 +1,3 @@ -import pensa from .mda_torsions import get_protein_backbone_torsions, get_protein_sidechain_torsions from .mda_distances import get_calpha_distances from pensa.features.processing import get_feature_timeseries @@ -52,7 +51,7 @@ def get_structure_features(pdb, xtc, start_frame=0, step_width=1, cossin=False, first_frame=start_frame, last_frame=None, step=step_width, naming='segindex', radians=True, include_omega=False - ) + ) feature_names['bb-torsions'] = bbtorsions[0] features_data['bb-torsions'] = bbtorsions[1] # Add sidechain torsions. @@ -61,7 +60,7 @@ def get_structure_features(pdb, xtc, start_frame=0, step_width=1, cossin=False, pdb, xtc, selection='all', first_frame=start_frame, last_frame=None, step=step_width, naming='segindex', radians=True - ) + ) feature_names['sc-torsions'] = sctorsions[0] features_data['sc-torsions'] = sctorsions[1] # Add backbone C-alpha distances. @@ -69,7 +68,7 @@ def get_structure_features(pdb, xtc, start_frame=0, step_width=1, cossin=False, bbdistances = get_calpha_distances( pdb, xtc, first_frame=start_frame, last_frame=None, step=step_width, - ) + ) feature_names['bb-distances'] = bbdistances[0] features_data['bb-distances'] = bbdistances[1] # Remove the residue-number offset @@ -103,22 +102,22 @@ def _remove_resnum_offset(features, offset): if 'bb-torsions' in features.keys(): for f in features['bb-torsions']: fsplit = f.split(' ') - resnum = int(f.split(' ')[3])-offset + resnum = int(f.split(' ')[3]) - offset fsplit[3] = str(resnum) new_features['bb-torsions'].append(' '.join(fsplit)) if 'sc-torsions' in features.keys(): for f in features['sc-torsions']: fsplit = f.split(' ') - resnum = int(f.split(' ')[3])-offset + resnum = int(f.split(' ')[3]) - offset fsplit[3] = str(resnum) new_features['sc-torsions'].append(' '.join(fsplit)) if 'bb-distances' in features.keys(): for f in features['bb-distances']: fsplit = f.split(' ') - resnum1 = int(f.split(' ')[2])-offset - resnum2 = int(f.split(' ')[6])-offset + resnum1 = int(f.split(' ')[2]) - offset + resnum2 = int(f.split(' ')[6]) - offset fsplit[2] = str(resnum1) fsplit[6] = str(resnum2) new_features['bb-distances'].append(' '.join(fsplit)) diff --git a/pensa/features/mda_distances.py b/pensa/features/mda_distances.py index 0a7797c..55a7e16 100644 --- a/pensa/features/mda_distances.py +++ b/pensa/features/mda_distances.py @@ -1,4 +1,3 @@ -import pensa import numpy as np import MDAnalysis as mda import MDAnalysis.lib.distances as ld @@ -49,15 +48,15 @@ def get_atom_group_distances(pdb, xtc, sel_a='protein', sel_b='resname LIG', num_at_b = len(b) # Name the atoms - if naming=='chainid': - at_labels_a = ['%s %s %s %s'%(atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in a] - at_labels_b = ['%s %s %s %s'%(atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in b] - elif naming=='segid': - at_labels_a = ['%s %s %s %s'%(atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in a] - at_labels_b = ['%s %s %s %s'%(atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in b] + if naming == 'chainid': + at_labels_a = ['%s %s %s %s' % (atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in a] + at_labels_b = ['%s %s %s %s' % (atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in b] + elif naming == 'segid': + at_labels_a = ['%s %s %s %s' % (atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in a] + at_labels_b = ['%s %s %s %s' % (atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in b] else: - at_labels_a = ['%s %s %s'%(atom.residue.resname, atom.resid, atom.name) for atom in a] - at_labels_b = ['%s %s %s'%(atom.residue.resname, atom.resid, atom.name) for atom in b] + at_labels_a = ['%s %s %s' % (atom.residue.resname, atom.resid, atom.name) for atom in a] + at_labels_b = ['%s %s %s' % (atom.residue.resname, atom.resid, atom.name) for atom in b] # Name the distance labels d_labels = [] @@ -65,12 +64,12 @@ def get_atom_group_distances(pdb, xtc, sel_a='protein', sel_b='resname LIG', for i in range(num_at_a): for j in range(num_at_b): k += 1 - _dl = 'DIST: %s - %s'%(at_labels_a[i], at_labels_b[j]) + _dl = 'DIST: %s - %s' % (at_labels_a[i], at_labels_b[j]) d_labels.append(_dl) # Calculate the distances num_at = len(a) - num_dist = int(num_at*(num_at-1)/2) + num_dist = int(num_at * (num_at - 1) / 2) len_traj = len(u.trajectory[first_frame:last_frame:step]) template = np.zeros([num_dist, ]) data_arr = np.zeros([len_traj, num_dist]) @@ -120,12 +119,12 @@ def get_atom_self_distances(pdb, xtc, selection='all', first_frame=0, last_frame num_at = len(a) # Name the atoms - if naming=='chainid': - at_labels = ['%s %s %s %s'%(atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in a] - elif naming=='segid': - at_labels = ['%s %s %s %s'%(atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in a] + if naming == 'chainid': + at_labels = ['%s %s %s %s' % (atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in a] + elif naming == 'segid': + at_labels = ['%s %s %s %s' % (atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in a] else: - at_labels = ['%s %s %s'%(atom.residue.resname, atom.resid, atom.name) for atom in a] + at_labels = ['%s %s %s' % (atom.residue.resname, atom.resid, atom.name) for atom in a] # Name the distance labels d_labels = [] @@ -133,12 +132,12 @@ def get_atom_self_distances(pdb, xtc, selection='all', first_frame=0, last_frame for i in range(num_at): for j in range(i + 1, num_at): k += 1 - _dl = 'DIST: %s - %s'%(at_labels[i], at_labels[j]) + _dl = 'DIST: %s - %s' % (at_labels[i], at_labels[j]) d_labels.append(_dl) # Calculate the distances num_at = len(a) - num_dist = int(num_at*(num_at-1)/2) + num_dist = int(num_at * (num_at - 1) / 2) len_traj = len(u.trajectory[first_frame:last_frame:step]) template = np.zeros([num_dist, ]) data_arr = np.zeros([len_traj, num_dist]) @@ -175,11 +174,11 @@ def get_calpha_distances(pdb, xtc, first_frame=0, last_frame=None, step=1): Data for all C-alpha distances [Å] """ - names, data = get_atom_self_distances(pdb, xtc, - selection='name CA', - first_frame=first_frame, - last_frame=last_frame, - step=step) + names, data = get_atom_self_distances(pdb, xtc, + selection='name CA', + first_frame=first_frame, + last_frame=last_frame, + step=step) return names, data @@ -246,21 +245,19 @@ def get_gpcr_calpha_distances(pdb, xtc, gpcr_name, res_dbnum, # Create the selection string selection = 'name CA and resid' for rn in resnums: - selection += ' %i'%rn + selection += ' %i' % rn # Create the GPCRdb distance labels distlabels = [] k = -1 for i in range(len(reslabels)): for j in range(i + 1, len(reslabels)): k += 1 - _dl = 'CA DIST: %s - %s'%(reslabels[i], reslabels[j]) + _dl = 'CA DIST: %s - %s' % (reslabels[i], reslabels[j]) distlabels.append(_dl) # Calculate the distances and get the sequential names - names, data = get_atom_self_distances(pdb, xtc, - selection=selection, - first_frame=first_frame, - last_frame=last_frame, - step=step) + names, data = get_atom_self_distances(pdb, xtc, + selection=selection, + first_frame=first_frame, + last_frame=last_frame, + step=step) return names, distlabels, data - - diff --git a/pensa/features/mda_torsions.py b/pensa/features/mda_torsions.py index 3fd1249..84d7c3f 100644 --- a/pensa/features/mda_torsions.py +++ b/pensa/features/mda_torsions.py @@ -1,4 +1,3 @@ -import pensa import numpy as np import MDAnalysis as mda from MDAnalysis.analysis.dihedrals import Dihedral @@ -49,16 +48,16 @@ def get_torsions(pdb, xtc, sel=[[0, 1, 2, 3], [1, 2, 3, 4]], first_frame=0, last torsion_labels = [] for ta in torsion_atoms: # Name the atoms - if naming=='chainid': - at_labels = ['%s %s %s %s'%(atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in ta] - elif naming=='segid': - at_labels = ['%s %s %s %s'%(atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in ta] - elif naming=='segindex': - at_labels = ['%s %s %s %s'%(atom.segindex, atom.residue.resname, atom.resid, atom.name) for atom in ta] + if naming == 'chainid': + at_labels = ['%s %s %s %s' % (atom.chainID, atom.residue.resname, atom.resid, atom.name) for atom in ta] + elif naming == 'segid': + at_labels = ['%s %s %s %s' % (atom.segid, atom.residue.resname, atom.resid, atom.name) for atom in ta] + elif naming == 'segindex': + at_labels = ['%s %s %s %s' % (atom.segindex, atom.residue.resname, atom.resid, atom.name) for atom in ta] else: - at_labels = ['%s %s %s'%(atom.residue.resname, atom.resid, atom.name) for atom in ta] + at_labels = ['%s %s %s' % (atom.residue.resname, atom.resid, atom.name) for atom in ta] # Name the torsion labels - _tl = 'TORS: %s - %s - %s - %s'%(at_labels[0], at_labels[1], at_labels[2], at_labels[3]) + _tl = 'TORS: %s - %s - %s - %s' % (at_labels[0], at_labels[1], at_labels[2], at_labels[3]) torsion_labels.append(_tl) return torsion_labels, dihedrals.angles @@ -87,72 +86,9 @@ def find_atom_by_name(res, at_name): return -1 -def find_atom_indices_per_residue(pdb, at_names=["C4'", "P", "C4'", "P"], rel_res=[-1, 0, 0, 1], - selection='all', verbose=False): - """ - Find the indices of atoms with a certain name for each residue (and its neighbors). - - Parameters - ---------- - pdb : str - File name for the reference file (PDB or GRO format). - at_names : list of str - Names of the requested atoms. - rel_res : list of int, default=[[0, 1, 2, 3]] - Residue number of each atom's residue relative to the current residue. - selection : str, default = 'all' - MDAnalysis selection string - verbose : bool, default = False - Print info for all residues. - - Returns - ------- - feature_names : list of str - Generic names of all torsions - features_data : numpy array - Data for all torsions [Å] - """ - - assert len(at_names) == len(rel_res) - rel_res = np.array(rel_res, dtype=int) - - u = mda.Universe(pdb) - a = u.select_atoms(selection) - r = a.residues - - indices_list = [] - - # In each residue .. - for i, res in enumerate(r): - - sel_resnums = res.resnum + rel_res - sel_resinds = i + rel_res - - # Check whether all residue indices are present. - if not np.all(sel_resinds < len(r)): - continue - - # Check whether consecutive indices correspond to consecutive residue numbers - # (we don't want to calculate torsions between non-connected residues) - if not np.all(sel_resnums == r[sel_resinds].resnums): - continue - - indices = -np.ones(len(at_names), dtype=int) - # For each selection name ... - for j, at_name in enumerate(at_names): - # ... check each atom. - indices[j] = find_atom_by_name(r[i+rel_res[j]], at_name) - - if np.all(indices >= 0): - if verbose: print(indices, a[indices].names, a[indices].resids) - indices_list.append(indices) - - return indices_list - - -def list_depth(l): - if isinstance(l, list): - return 1 + max(list_depth(item) for item in l) +def list_depth(a_list): + if isinstance(a_list, list): + return 1 + max(list_depth(item) for item in a_list) else: return 0 @@ -222,12 +158,13 @@ def find_atom_indices_per_residue(pdb, at_names=["C4'", "P", "C4'", "P"], rel_re indices = -np.ones(len(rel_res), dtype=int) for j, at_name in enumerate(at_names[num_at_sets]): # ... check each atom. - indices[j] = find_atom_by_name(r[i+rel_res[j]], at_name) + indices[j] = find_atom_by_name(r[i + rel_res[j]], at_name) num_indices = np.sum(indices >= 0) num_at_sets += 1 if num_indices == len(rel_res): - if verbose: print(indices, a[indices].names, a[indices].resids) + if verbose: + print(indices, a[indices].names, a[indices].resids) indices_list.append(indices) return indices_list @@ -243,8 +180,8 @@ def get_nucleicacid_backbone_torsions(pdb, xtc, selection='all', BETA (β): P(i)-O5'(i)-C5'(i)-C4'(i) GAMMA (γ): O5'(i)-C5'(i)-C4'(i)-C3'(i) DELTA (δ): C5'(i)-C4'(i)-C3'(i)-O3'(i) - EPSILON (ε): C4'(i)-C3'(i)-O3'(i)-P(i+1) - ZETA (ζ): C3'(i)-O3'(i)-P(i+1)-O5'(i+1) + EPSILON (ε): C4'(i)-C3'(i)-O3'(i)-P(i + 1) + ZETA (ζ): C3'(i)-O3'(i)-P(i + 1)-O5'(i + 1) CHI (χ): O4'(i)-C1'(i)-N9(i)-C4(i) for purines O4'(i)-C1'(i)-N1(i)-C2(i) for pyridines @@ -308,14 +245,14 @@ def get_nucleicacid_backbone_torsions(pdb, xtc, selection='all', rel_res=[0, 0, 0, 0], selection=selection ) - # EPSILON (ε): C4'(i)-C3'(i)-O3'(i)-P(i+1) + # EPSILON (ε): C4'(i)-C3'(i)-O3'(i)-P(i + 1) indices_epsilon = find_atom_indices_per_residue( pdb, at_names=["C4'", "C3'", "O3'", "P"], rel_res=[0, 0, 0, 1], selection=selection ) - # ZETA (ζ): C3'(i)-O3'(i)-P(i+1)-O5'(i+1) + # ZETA (ζ): C3'(i)-O3'(i)-P(i + 1)-O5'(i + 1) indices_zeta = find_atom_indices_per_residue( pdb, at_names=["C3'", "O3'", "P", "O5'"], @@ -333,31 +270,32 @@ def get_nucleicacid_backbone_torsions(pdb, xtc, selection='all', # Define angle names for labels angles = [] - angles += ['ALPHA']*len(indices_alpha) - angles += ['BETA']*len(indices_beta) - angles += ['GAMMA']*len(indices_gamma) - angles += ['DELTA']*len(indices_delta) - angles += ['EPSILON']*len(indices_epsilon) - angles += ['ZETA']*len(indices_zeta) - angles += ['CHI']*len(indices_chi) + angles += ['ALPHA'] * len(indices_alpha) + angles += ['BETA'] * len(indices_beta) + angles += ['GAMMA'] * len(indices_gamma) + angles += ['DELTA'] * len(indices_delta) + angles += ['EPSILON'] * len(indices_epsilon) + angles += ['ZETA'] * len(indices_zeta) + angles += ['CHI'] * len(indices_chi) # Calculate the torsions + all_indices = indices_alpha + indices_beta + indices_gamma + indices_delta + indices_epsilon + indices_zeta + indices_chi torsions = get_torsions( - pdb, xtc, sel=indices_alpha+indices_beta+indices_gamma+indices_delta+indices_epsilon+indices_zeta+indices_chi, + pdb, xtc, sel=all_indices, first_frame=0, last_frame=None, step=1, - naming = naming + naming=naming ) # Extract the residue info nums = [pti.split(' - ')[1].split(' ')[-2] for pti in torsions[0]] names = [pti.split(' - ')[1].split(' ')[-3] for pti in torsions[0]] - if naming=='chainid' or naming=='segid' or naming=='segindex': + if naming == 'chainid' or naming == 'segid' or naming == 'segindex': seg = [pti.split(' - ')[1].split(' ')[-4] for pti in torsions[0]] else: - seg = ['0']*len(angles) + seg = ['0'] * len(angles) # Construct label names - labels = [ ang+' '+seg[i]+' '+names[i]+' '+nums[i] for i, ang in enumerate(angles)] + labels = [ang + ' ' + seg[i] + ' ' + names[i] + ' ' + nums[i] for i, ang in enumerate(angles)] # Convert to radians if so desired if radians: - values = torsions[1] * np.pi/180 + values = torsions[1] * np.pi / 180 else: values = torsions[1] return labels, values @@ -369,8 +307,8 @@ def get_nucleicacid_pseudotorsions(pdb, xtc, selection='all', """ Load nucleic acid pseudotorsions - ETA (η): C4'(i-1)-P(i)-C4'(i)-P(i+1) - THETA (θ): P(i)-C4'(i)-P(i+1)-C4'(i+1) + ETA (η): C4'(i-1)-P(i)-C4'(i)-P(i + 1) + THETA (θ): P(i)-C4'(i)-P(i + 1)-C4'(i + 1) Parameters ---------- @@ -417,40 +355,40 @@ def get_nucleicacid_pseudotorsions(pdb, xtc, selection='all', selection=selection ) # Define angle names for labels - angles = ['ETA']*len(indices_eta) + ['THETA']*len(indices_theta) + angles = ['ETA'] * len(indices_eta) + ['THETA'] * len(indices_theta) # Calculate the torsions torsions = get_torsions( - pdb, xtc, sel=indices_eta+indices_theta, + pdb, xtc, sel=indices_eta + indices_theta, first_frame=0, last_frame=None, step=1, - naming = naming + naming=naming ) # Extract the residue info nums = [pti.split(' - ')[1].split(' ')[-2] for pti in torsions[0]] names = [pti.split(' - ')[1].split(' ')[-3] for pti in torsions[0]] - if naming=='chainid' or naming=='segid' or naming=='segindex': + if naming == 'chainid' or naming == 'segid' or naming == 'segindex': seg = [pti.split(' - ')[1].split(' ')[-4] for pti in torsions[0]] else: - seg = ['0']*len(angles) + seg = ['0'] * len(angles) # Construct label names - labels = [ ang+' '+seg[i]+' '+names[i]+' '+nums[i] for i, ang in enumerate(angles)] + labels = [ang + ' ' + seg[i] + ' ' + names[i] + ' ' + nums[i] for i, ang in enumerate(angles)] # Convert to radians if so desired if radians: - values = torsions[1] * np.pi/180 + values = torsions[1] * np.pi / 180 else: values = torsions[1] return labels, values def get_protein_backbone_torsions(pdb, xtc, selection='all', - first_frame=0, last_frame=None, step=1, - naming='segindex', radians=False, - include_omega=False): + first_frame=0, last_frame=None, step=1, + naming='segindex', radians=False, + include_omega=False): """ Load protein backbone torsions PHI (φ): C(i-1)-N(i)-CA(i)-C(i) - PSI (ψ): N(i)-CA(i)-C(i)-N(i+1) - OMEGA (ω): CA(i)-C(i)-N(i+1)-CA(i+1) + PSI (ψ): N(i)-CA(i)-C(i)-N(i + 1) + OMEGA (ω): CA(i)-C(i)-N(i + 1)-CA(i + 1) Parameters ---------- @@ -491,14 +429,14 @@ def get_protein_backbone_torsions(pdb, xtc, selection='all', rel_res=[-1, 0, 0, 0], selection=selection ) - # PSI (ψ): N(i)-CA(i)-C(i)-N(i+1) + # PSI (ψ): N(i)-CA(i)-C(i)-N(i + 1) indices_psi = find_atom_indices_per_residue( pdb, at_names=["N", "CA", "C", "N"], rel_res=[0, 0, 0, 1], selection=selection ) - # OMEGA (ω): CA(i)-C(i)-N(i+1)-CA(i+1) + # OMEGA (ω): CA(i)-C(i)-N(i + 1)-CA(i + 1) indices_omega = find_atom_indices_per_residue( pdb, at_names=["CA", "C", "N", "CA"], @@ -506,29 +444,29 @@ def get_protein_backbone_torsions(pdb, xtc, selection='all', selection=selection ) # Define angle names for labels - angles = ['PHI']*len(indices_phi) + ['PSI']*len(indices_psi) - torsion_selection = indices_phi+indices_psi + angles = ['PHI'] * len(indices_phi) + ['PSI'] * len(indices_psi) + torsion_selection = indices_phi + indices_psi if include_omega: - angles += ['OMEGA']*len(indices_omega) + angles += ['OMEGA'] * len(indices_omega) torsion_selection += indices_omega # Calculate the torsions torsions = get_torsions( pdb, xtc, sel=torsion_selection, first_frame=0, last_frame=None, step=1, - naming = naming + naming=naming ) # Extract the residue info nums = [pti.split(' - ')[1].split(' ')[-2] for pti in torsions[0]] names = [pti.split(' - ')[1].split(' ')[-3] for pti in torsions[0]] - if naming=='chainid' or naming=='segid' or naming=='segindex': + if naming == 'chainid' or naming == 'segid' or naming == 'segindex': seg = [pti.split(' - ')[1].split(' ')[-4] for pti in torsions[0]] else: - seg = ['0']*len(angles) + seg = ['0'] * len(angles) # Construct label names - labels = [ ang+' '+seg[i]+' '+names[i]+' '+nums[i] for i, ang in enumerate(angles)] + labels = [ang + ' ' + seg[i] + ' ' + names[i] + ' ' + nums[i] for i, ang in enumerate(angles)] # Convert to radians if so desired if radians: - values = torsions[1] * np.pi/180 + values = torsions[1] * np.pi / 180 else: values = torsions[1] return labels, values @@ -614,11 +552,11 @@ def get_protein_sidechain_torsions(pdb, xtc, selection='all', ) # Define angle names for labels angles = [] - angles += ['CHI1']*len(indices_chi1) - angles += ['CHI2']*len(indices_chi2) - angles += ['CHI3']*len(indices_chi3) - angles += ['CHI4']*len(indices_chi4) - angles += ['CHI5']*len(indices_chi5) + angles += ['CHI1'] * len(indices_chi1) + angles += ['CHI2'] * len(indices_chi2) + angles += ['CHI3'] * len(indices_chi3) + angles += ['CHI4'] * len(indices_chi4) + angles += ['CHI5'] * len(indices_chi5) # Define torsion selections torsion_selection = [] torsion_selection += indices_chi1 @@ -630,20 +568,20 @@ def get_protein_sidechain_torsions(pdb, xtc, selection='all', torsions = get_torsions( pdb, xtc, sel=torsion_selection, first_frame=0, last_frame=None, step=1, - naming = naming + naming=naming ) # Extract the residue info nums = [pti.split(' - ')[1].split(' ')[-2] for pti in torsions[0]] names = [pti.split(' - ')[1].split(' ')[-3] for pti in torsions[0]] - if naming=='chainid' or naming=='segid' or naming=='segindex': + if naming == 'chainid' or naming == 'segid' or naming == 'segindex': seg = [pti.split(' - ')[1].split(' ')[-4] for pti in torsions[0]] else: - seg = ['0']*len(angles) + seg = ['0'] * len(angles) # Construct label names - labels = [ ang+' '+seg[i]+' '+names[i]+' '+nums[i] for i, ang in enumerate(angles)] + labels = [ang + ' ' + seg[i] + ' ' + names[i] + ' ' + nums[i] for i, ang in enumerate(angles)] # Convert to radians if so desired if radians: - values = torsions[1] * np.pi/180 + values = torsions[1] * np.pi / 180 else: values = torsions[1] return labels, values diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 8191fe1..adb9340 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -1,8 +1,4 @@ import numpy as np -import scipy as sp -import scipy.stats -import scipy.spatial -import scipy.spatial.distance import os from pensa.preprocessing import sort_coordinates @@ -192,11 +188,11 @@ def get_multivar_res_timeseries(feat, data, feature_type, write=None, out_name=N resname = feat_name_split[-2] + ' ' + feat_name_split[-1] sorted_names.append(resname) if write is True: - for subdir in [feature_type+'/']: + for subdir in [feature_type + '/']: if not os.path.exists(subdir): os.makedirs(subdir) resname_out = feat_name_split[-2] + feat_name_split[-1] - filename = feature_type+'/' + out_name + resname_out + ".txt" + filename = feature_type + '/' + out_name + resname_out + ".txt" np.savetxt(filename, feat_timeseries, delimiter=', ', newline='\n') # return multivar_res_timeseries_data @@ -447,10 +443,10 @@ def correct_spher_angle_periodicity(two_angles): psi_heights = np.histogram(psi_continuous_angles, bins=90, density=True) # Shift everything before bin with minimum height by periodic amount φ ∈ [0, 2π) - psi_shift = 2*np.pi + psi_shift = 2 * np.pi if psi_heights[0][0] > min(psi_heights[0]): perbound = psi_heights[1][np.where( - psi_heights[0] == min(psi_heights[0]))[0][0]+1] + psi_heights[0] == min(psi_heights[0]))[0][0] + 1] for angle_index in range(len(psi_continuous_angles)): if psi_continuous_angles[angle_index] < perbound: @@ -468,10 +464,10 @@ def correct_spher_angle_periodicity(two_angles): theta_heights = np.histogram( theta_continuous_angles, bins=90, density=True) # Shift everything before bin with minimum height by periodic amount θ ∈ [0, π] - theta_shift = 2*np.pi + theta_shift = 2 * np.pi if theta_heights[0][0] > min(theta_heights[0]): perbound = theta_heights[1][np.where( - theta_heights[0] == min(theta_heights[0]))[0][0]+1] + theta_heights[0] == min(theta_heights[0]))[0][0] + 1] for angle_index in range(len(theta_continuous_angles)): if theta_continuous_angles[angle_index] < perbound: theta_continuous_angles[angle_index] = theta_shift - \ @@ -502,11 +498,11 @@ def correct_angle_periodicity(angle): heights = np.histogram(new_angle, bins=90, density=True) # Shift everything before bin with minimum height by periodic amount if heights[0][0] > min(heights[0]): - perbound = heights[1][np.where(heights[0] == min(heights[0]))[0][0]+1] + perbound = heights[1][np.where(heights[0] == min(heights[0]))[0][0] + 1] # print(perbound) for angle_index in range(len(new_angle)): if new_angle[angle_index] < perbound: - new_angle[angle_index] += 2*np.pi + new_angle[angle_index] += 2 * np.pi return new_angle @@ -542,7 +538,7 @@ def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_na """ if verbose: - print('Sorting along feature '+feature_name) + print('Sorting along feature ' + feature_name) d = get_feature_data(feat, data, feature_name) d_sorted, sort_idx, oidx_sort = sort_coordinates( d, ref_name, trj_name, out_name, start_frame=start_frame) diff --git a/pensa/features/pyemma_features.py b/pensa/features/pyemma_features.py index 202f69d..190d685 100644 --- a/pensa/features/pyemma_features.py +++ b/pensa/features/pyemma_features.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# - * - coding: utf-8 - * - """ Methods to featurize a protein, based on PyEMMA. @@ -92,7 +92,7 @@ def _describe_dist_without_atom_numbers(feature_names): """ desc = feature_names.describe() - desc = [ _remove_atom_numbers_from_distance(d) for d in desc ] + desc = [_remove_atom_numbers_from_distance(d) for d in desc] return desc @@ -116,7 +116,7 @@ def _remove_atom_numbers_from_distance(feat_str): # Glue the desired parts back together new_feat = parts[0] for nr in [1, 2, 3, 5, 6, 7, 8]: - new_feat += ' '+parts[nr] + new_feat += ' ' + parts[nr] return new_feat @@ -144,22 +144,22 @@ def _remove_resnum_offset(features, offset): if 'bb-torsions' in features.keys(): for f in features['bb-torsions']: fsplit = f.split(' ') - resnum = int(f.split(' ')[3])-offset + resnum = int(f.split(' ')[3]) - offset fsplit[3] = str(resnum) new_features['bb-torsions'].append(' '.join(fsplit)) if 'sc-torsions' in features.keys(): for f in features['sc-torsions']: fsplit = f.split(' ') - resnum = int(f.split(' ')[3])-offset + resnum = int(f.split(' ')[3]) - offset fsplit[3] = str(resnum) new_features['sc-torsions'].append(' '.join(fsplit)) if 'bb-distances' in features.keys(): for f in features['bb-distances']: fsplit = f.split(' ') - resnum1 = int(f.split(' ')[2])-offset - resnum2 = int(f.split(' ')[6])-offset + resnum1 = int(f.split(' ')[2]) - offset + resnum2 = int(f.split(' ')[6]) - offset fsplit[2] = str(resnum1) fsplit[6] = str(resnum2) new_features['bb-distances'].append(' '.join(fsplit)) @@ -201,4 +201,3 @@ def sort_traj_along_pyemma_feature(feat, data, feature_name, feature_type, ref_n sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) d_sorted = d[sort_idx] return d_sorted - diff --git a/pensa/features/water_features.py b/pensa/features/water_features.py index f967c90..c3939c3 100644 --- a/pensa/features/water_features.py +++ b/pensa/features/water_features.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# - * - coding: utf-8 - * - """ Methods to obtain a timeseries distribution for the water pockets which respresents a combination of the water occupancy (binary variable) and the water polarisation (continuous variable). @@ -21,7 +21,8 @@ from gridData import Grid from tqdm import tqdm import os -from pensa.preprocessing.density import * +from pensa.preprocessing.density import \ + convert_to_occ, data_out, write_atom_to_pdb, get_grid, local_maxima_3D def _convert_to_dipole(water_atom_positions): @@ -42,29 +43,30 @@ def _convert_to_dipole(water_atom_positions): """ - ## Coordinates for water atoms + # Coordinates for water atoms Ot0 = np.array(water_atom_positions[0]) H1t0 = np.array(water_atom_positions[1]) H2t0 = np.array(water_atom_positions[2]) - ## Dipole vector + # Dipole vector dipVector0 = (H1t0 + H2t0) * 0.5 - Ot0 - x_axis=dipVector0[0] - y_axis=dipVector0[1] - z_axis=dipVector0[2] + x_axis = dipVector0[0] + y_axis = dipVector0[1] + z_axis = dipVector0[2] - ## Convert to spherical coordinates - ## radians + # Convert to spherical coordinates + # radians # φ ∈ [0, 2π) - psi=np.arctan2(y_axis, x_axis) + psi = np.arctan2(y_axis, x_axis) # θ ∈ [0, π] - theta=np.arccos(z_axis/(np.sqrt(x_axis**2+y_axis**2+z_axis**2))) - ## degrees + theta = np.arccos(z_axis / (np.sqrt(x_axis ** 2 + y_axis ** 2 + z_axis ** 2))) + # degrees # psi=math.degrees(np.arctan2(y_axis, x_axis)) - # theta=math.degrees(np.arccos(z_axis/(np.sqrt(x_axis**2+y_axis**2+z_axis**2)))) + # theta=math.degrees(np.arccos(z_axis/(np.sqrt(x_axis * *2 + y_axis * *2 + z_axis * *2)))) return psi, theta + def get_water_features(structure_input, xtc_input, atomgroup, top_waters=10, grid_input=None, write=None, write_grid_as=None, out_name=None): """ @@ -128,7 +130,7 @@ def get_water_features(structure_input, xtc_input, atomgroup, top_waters=10, u.trajectory.rewind() if grid_input is None: - g = get_grid(u, atomgroup, write_grid_as, out_name) + g = get_grid(u, atomgroup, write_grid_as, out_name) else: g = Grid(grid_input) elif grid_input is None: @@ -137,78 +139,75 @@ def get_water_features(structure_input, xtc_input, atomgroup, top_waters=10, g = Grid(grid_input) xyz, val = local_maxima_3D(g.grid) - ## Negate the array to get probabilities in descending order - val_sort = np.argsort(-1*val.copy()) + # Negate the array to get probabilities in descending order + val_sort = np.argsort(-1 * val.copy()) coords = [xyz[max_val] for max_val in val_sort] - maxdens_coord_str = [str(item)[1:-1] for item in coords] - water_information=[] - water_dists=[] + # maxdens_coord_str = [str(item)[1:-1] for item in coords] + water_information = [] + water_dists = [] if top_waters > len(coords): top_waters = len(coords) - print('\n') print('Featurizing ', top_waters, ' Waters') for wat_no in range(top_waters): print('\n') - print('Water no: ', wat_no+1) + print('Water no: ', wat_no + 1) print('\n') - thetalist=[] - psilist=[] + thetalist = [] + psilist = [] - ## Find all water atoms within 3.5 Angstroms of density maxima + # Find all water atoms within 3.5 Angstroms of density maxima # Shifting the coordinates of the maxima by the grid origin to match # the simulation box coordinates - shifted_coords=coords[wat_no]+g.origin + shifted_coords = coords[wat_no] + g.origin point_str = str(shifted_coords)[1:-1] - counting=[] + counting = [] for frame_no in tqdm(range(len(u.trajectory))): - # for frame_no in tqdm(range(100)): u.trajectory[frame_no] radius = ' 3.5' atomgroup_IDS = u.select_atoms('name ' + atomgroup + ' and point ' + point_str + radius).indices counting.append(atomgroup_IDS) - ## Water atom indices that appear in the water site + # Water atom indices that appear in the water site flat_list = [item for sublist in counting for item in sublist] - ## Extract water orientation timeseries + # Extract water orientation timeseries for frame_no in tqdm(range(len(u.trajectory))): - # for frame_no in tqdm(range(100)): u.trajectory[frame_no] - waters_resid=counting[frame_no] - if len(waters_resid)==1: - ## (x, y, z) positions for the water oxygen at trajectory frame_no + waters_resid = counting[frame_no] + if len(waters_resid) == 1: + # (x, y, z) positions for the water oxygen at trajectory frame_no water_atom_positions = [list(pos) for pos in u.select_atoms('byres index ' + str(waters_resid[0])).positions] psi, theta = _convert_to_dipole(water_atom_positions) psilist.append(psi) thetalist.append(theta) - ## Featurize water with highest pocket occupation (if multiple waters in pocket) - elif len(waters_resid)>1: - freq_count=[] + # Featurize water with highest pocket occupation (if multiple waters in pocket) + elif len(waters_resid) > 1: + freq_count = [] for ID in waters_resid: freq_count.append([flat_list.count(ID), ID]) - freq_count.sort(key = lambda x: x[0]) + freq_count.sort(key=lambda x: x[0]) water_atom_positions = [list(pos) for pos in u.select_atoms('byres index ' + str(freq_count[-1][1])).positions] psi, theta = _convert_to_dipole(water_atom_positions) psilist.append(psi) thetalist.append(theta) - ## 10000.0 = no waters bound - elif len(waters_resid)<1: + # 10000.0 = no waters bound + elif len(waters_resid) < 1: psilist.append(10000.0) thetalist.append(10000.0) water_out = [psilist, thetalist] water_dists.append(water_out) - water_ID = "O" + str(wat_no+1) - water_pocket_occupation_frequency = 1 - psilist.count(10000.0)/len(psilist) + water_ID = "O" + str(wat_no + 1) + water_pocket_occupation_frequency = 1 - psilist.count(10000.0) / len(psilist) water_pocket_occupation_frequency = round(water_pocket_occupation_frequency, 4) atom_location = shifted_coords water_information.append([water_ID, list(atom_location), water_pocket_occupation_frequency]) - ## Write data out and visualize water sites in pdb + # Write data out and visualize water sites in pdb if write is True: data_out('water_features/' + out_name + water_ID + '.txt', water_out) data_out('water_features/' + out_name + 'WatersSummary.txt', water_information) @@ -217,31 +216,29 @@ def get_water_features(structure_input, xtc_input, atomgroup, top_waters=10, u_pdb.add_TopologyAttr('tempfactors') # Write values as beta-factors ("tempfactors") to a PDB file for res in range(len(water_information)): - #scale the water resid by the starting resid - water_resid = len(u_pdb.residues) - wat_no-1 + res + # scale the water resid by the starting resid + water_resid = len(u_pdb.residues) - wat_no - 1 + res u_pdb.residues[water_resid].atoms.tempfactors = water_information[res][-1] u_pdb.atoms.write(pdb_outname) # Add water pocket orientations - feature_names['WaterPocket_Distr']= [watinf[0] for watinf in water_information] - features_data['WaterPocket_Distr']= np.array(water_dists, dtype=object) + feature_names['WaterPocket_Distr'] = [watinf[0] for watinf in water_information] + features_data['WaterPocket_Distr'] = np.array(water_dists, dtype=object) - occup=[watinf[2] for watinf in water_information] + occup = [watinf[2] for watinf in water_information] # Add water pocket occupancies - feature_names['WaterPocket_Occup']= [watinf[0] for watinf in water_information] - features_data['WaterPocket_Occup']= np.array(occup, dtype=object) + feature_names['WaterPocket_Occup'] = [watinf[0] for watinf in water_information] + features_data['WaterPocket_Occup'] = np.array(occup, dtype=object) - occup_distr=[[convert_to_occ(distr[0], 10000.0)] for distr in water_dists] + occup_distr = [[convert_to_occ(distr[0], 10000.0)] for distr in water_dists] # Add water pocket occupancy timeseries - feature_names['WaterPocket_OccupDistr']= [watinf[0] for watinf in water_information] - features_data['WaterPocket_OccupDistr']= np.array(occup_distr, dtype=object) + feature_names['WaterPocket_OccupDistr'] = [watinf[0] for watinf in water_information] + features_data['WaterPocket_OccupDistr'] = np.array(occup_distr, dtype=object) - loc=[watinf[1] for watinf in water_information] + loc = [watinf[1] for watinf in water_information] # Add water pocket locations - feature_names['WaterPocket_xyz']= [watinf[0] for watinf in water_information] - features_data['WaterPocket_xyz']= np.array(loc, dtype=object) + feature_names['WaterPocket_xyz'] = [watinf[0] for watinf in water_information] + features_data['WaterPocket_xyz'] = np.array(loc, dtype=object) # Return the dictionaries. return feature_names, features_data - - diff --git a/pensa/preprocessing/download.py b/pensa/preprocessing/download.py index ab4e11d..e2b8f52 100644 --- a/pensa/preprocessing/download.py +++ b/pensa/preprocessing/download.py @@ -45,16 +45,16 @@ def get_transmem_from_uniprot(uniprot_id): List of all transmembrane regions, represented as tuples with first and last residue ID. """ - url = 'https://www.uniprot.org/uniprot/'+uniprot_id+'.txt' + url = 'https://www.uniprot.org/uniprot/' + uniprot_id + '.txt' r = requests.get(url, allow_redirects=True) c = r.content tm = [] for line in c.splitlines(): if line.startswith(b'FT TRANSMEM'): - l = str(line) - l = l.replace('b\'FT TRANSMEM ', '') - l = l.replace('\'', '') - s = l.split('.') + new_line = str(line) + new_line = new_line.replace('b\'FT TRANSMEM ', '') + new_line = new_line.replace('\'', '') + s = new_line.split('.') tm.append((int(s[0]), int(s[-1]))) for tmi in tm: print(*tmi) diff --git a/pensa/preprocessing/selection.py b/pensa/preprocessing/selection.py index 24efff6..c99c309 100644 --- a/pensa/preprocessing/selection.py +++ b/pensa/preprocessing/selection.py @@ -18,7 +18,7 @@ def range_to_string(a, b): String containing all int numbers from a to b. """ - r = np.arange(a, b+1) + r = np.arange(a, b + 1) string = '' for ri in r: string += str(ri) @@ -50,5 +50,3 @@ def load_selection(sel_file, sel_base=''): r = np.array(line.strip().split(' '), dtype=int) sel_string += range_to_string(*r) return sel_string - - diff --git a/scripts/calculate_combined_clusters.py b/scripts/calculate_combined_clusters.py index b4fa42c..b9818ff 100644 --- a/scripts/calculate_combined_clusters.py +++ b/scripts/calculate_combined_clusters.py @@ -75,7 +75,7 @@ # Write indices to results file np.savetxt(args.out_results+'_combined-cluster-indices.csv', np.array([cidx, cond, oidx], dtype=int).T, - delimiter=',', fmt='%i', + delimiter=', ', fmt='%i', header='Cluster, Condition, Index within condition') # Write out frames for each cluster for each simulation diff --git a/scripts/calculate_combined_principal_components.py b/scripts/calculate_combined_principal_components.py index 5ec157d..770f37e 100644 --- a/scripts/calculate_combined_principal_components.py +++ b/scripts/calculate_combined_principal_components.py @@ -65,7 +65,7 @@ plot_file=args.out_plots+"_"+ftype+"_eigenvalues_combined.pdf") # Save them to a CSV file np.savetxt(args.out_results+"_"+ftype+"_eigenvalues_combined.csv", np.array([cn, ev]).T, - delimiter=',', header='Component, Eigenvalue') + delimiter=', ', header='Component, Eigenvalue') # Plot feature correlation with top components and print relevant features pca_features(pca, feat_a[ftype], args.num_components, args.feat_threshold, diff --git a/scripts/compare_feature_distributions.py b/scripts/compare_feature_distributions.py index 6df5399..4f01f4f 100644 --- a/scripts/compare_feature_distributions.py +++ b/scripts/compare_feature_distributions.py @@ -32,7 +32,7 @@ def workflow_torsions_jsd(args, feat_a, feat_b, data_a, data_b, tors='bb'): # Save all results (per feature) in CSV files np.savetxt(args.out_results+'_'+tors+'-torsions_relative-entropy.csv', np.array(relen).T, - fmt='%s', delimiter=',', header='Name, JSD(A,B), KLD(A,B), KLD(B,A)') + fmt='%s', delimiter=', ', header='Name, JSD(A, B), KLD(A, B), KLD(B, A)') # Save the Jensen-Shannon distance as "B factor" in a PDB file vis = residue_visualization(names, jsd, args.ref_file_a, @@ -71,7 +71,7 @@ def workflow_torsions_kss(args, feat_a, feat_b, data_a, data_b, tors='bb'): # Save all results (per feature) in CSV files np.savetxt(args.out_results+'_'+tors+'-torsions_kolmogorov-smirnov.csv', np.array(ksana).T, - fmt='%s', delimiter=',', header='Name, KSS(A,B), p-value') + fmt='%s', delimiter=', ', header='Name, KSS(A, B), p-value') # Save the Kolmogorov-Smirnov statistic as "B factor" in a PDB file vis = residue_visualization(names, kss, args.ref_file_a, @@ -105,7 +105,7 @@ def workflow_torsions_ssi(args, feat_a, feat_b, data_a, data_b, tors='bb'): # Save all results (per feature) in CSV files np.savetxt(args.out_results+'_'+tors+'-torsions_state-specific-information.csv', np.array(ana).T, - fmt='%s', delimiter=',', header='Name, SSI(A,B)') + fmt='%s', delimiter=', ', header='Name, SSI(A, B)') # Save the state-specific information as "B factor" in a PDB file vis = residue_visualization(resnames, ssi, args.ref_file_a, @@ -197,7 +197,7 @@ def workflow_torsions_ssi(args, feat_a, feat_b, data_a, data_b, tors='bb'): # Save all results (per feature) in a CSV file np.savetxt(args.out_results+'_bb-distances_relative-entropy.csv', np.array(relen).T, - fmt='%s', delimiter=',', header='Name, JSD(A,B), KLD(A,B), KLD(B,A)') + fmt='%s', delimiter=', ', header='Name, JSD(A, B), KLD(A, B), KLD(B, A)') # Print the features with the highest values print("Backbone C-alpha distances with the strongest deviations:") @@ -218,7 +218,7 @@ def workflow_torsions_ssi(args, feat_a, feat_b, data_a, data_b, tors='bb'): # Save all results (per feature) in a CSV file np.savetxt(args.out_results+'_bb-distances_difference-of-mean.csv', np.array(meanda).T, - fmt='%s', delimiter=',', header='Name, average, difference') + fmt='%s', delimiter=', ', header='Name, average, difference') # Sort the distances by their differences print("Backbone C-alpha distances with the strongest differences of their mean value:") diff --git a/scripts/extract_coordinates.py b/scripts/extract_coordinates.py index 7bb8839..68e82e2 100644 --- a/scripts/extract_coordinates.py +++ b/scripts/extract_coordinates.py @@ -15,17 +15,17 @@ parser.add_argument("--pdb_file", type=str, default='system.pdb') parser.add_argument("--trj_file", type=str, default='stitched_310.nc', nargs='+') parser.add_argument("--out_name", type=str, default='coordinates' ) - parser.add_argument("--start_frame",type=int, default=0 ) + parser.add_argument("--start_frame", type=int, default=0 ) args = parser.parse_args() # Load the selection and generate the string - if len(args.sel_file) > 0: + if len(args.sel_file) > 0: sel_string = load_selection(args.sel_file, args.sel_base) else: sel_string = args.sel_base print(sel_string) # Extract the coordinates from the trajectory - extract_coordinates(args.ref_file, args.pdb_file, args.trj_file, + extract_coordinates(args.ref_file, args.pdb_file, args.trj_file, args.out_name, sel_string, args.start_frame) diff --git a/scripts/extract_coordinates_combined.py b/scripts/extract_coordinates_combined.py index c2dca10..9c12fe9 100644 --- a/scripts/extract_coordinates_combined.py +++ b/scripts/extract_coordinates_combined.py @@ -17,7 +17,7 @@ parser.add_argument("--ref_file_b", type=str, default='system_b.psf', nargs='+') parser.add_argument("--trj_file_b", type=str, default='stitched_b.nc', nargs='+') parser.add_argument("--out_name", type=str, default='coordinates' ) - parser.add_argument("--start_frame",type=int, default=0 ) + parser.add_argument("--start_frame", type=int, default=0 ) args = parser.parse_args() # Load the selection and generate the strings @@ -30,15 +30,15 @@ print(args.trj_file_a) print(args.trj_file_b) - + # Combine the lists of input files and selections assert len(args.ref_file_a) == len(args.trj_file_a) assert len(args.ref_file_b) == len(args.trj_file_b) ref_file_list = args.ref_file_a + args.ref_file_b trj_file_list = args.trj_file_a + args.trj_file_b sel_string_list = [sel_string_a]*len(args.ref_file_a) + [sel_string_b]*len(args.ref_file_b) - + # Extract the coordinates from the trajectories - extract_coordinates_combined(ref_file_list, trj_file_list, sel_string_list, args.out_name, + extract_coordinates_combined(ref_file_list, trj_file_list, sel_string_list, args.out_name, start_frame=args.start_frame) diff --git a/scripts/get_tutorial_datasets.py b/scripts/get_tutorial_datasets.py index 291a681..2d271af 100644 --- a/scripts/get_tutorial_datasets.py +++ b/scripts/get_tutorial_datasets.py @@ -6,7 +6,7 @@ def subsample(psf_file, xtc_file, out_file): - u = mda.Universe(psf_file,xtc_file) + u = mda.Universe(psf_file, xtc_file) print(len(u.trajectory)) a = u.select_atoms('all') with mda.Writer(out_file, a.n_atoms) as W: @@ -33,11 +33,11 @@ def subsample(psf_file, xtc_file, out_file): # MOR-apo psf_file = '11427_dyn_151.psf' pdb_file = '11426_dyn_151.pdb' - download_from_gpcrmd(psf_file,root) - download_from_gpcrmd(pdb_file,root) - for sim in ['11423_trj_151','11424_trj_151','11425_trj_151']: + download_from_gpcrmd(psf_file, root) + download_from_gpcrmd(pdb_file, root) + for sim in ['11423_trj_151', '11424_trj_151', '11425_trj_151']: xtc_file = sim+'.xtc' - download_from_gpcrmd(xtc_file,root) + download_from_gpcrmd(xtc_file, root) if args.subsample: out_file = sim+'_subsampled.xtc' subsample(root+psf_file, root+xtc_file, root+out_file) @@ -45,11 +45,11 @@ def subsample(psf_file, xtc_file, out_file): # MOR-BU72 psf_file = '11580_dyn_169.psf' pdb_file = '11579_dyn_169.pdb' - download_from_gpcrmd(psf_file,root) - download_from_gpcrmd(pdb_file,root) - for sim in ['11576_trj_169','11577_trj_169','11578_trj_169']: + download_from_gpcrmd(psf_file, root) + download_from_gpcrmd(pdb_file, root) + for sim in ['11576_trj_169', '11577_trj_169', '11578_trj_169']: xtc_file = sim+'.xtc' - download_from_gpcrmd(xtc_file,root) + download_from_gpcrmd(xtc_file, root) if args.subsample: out_file = sim+'_subsampled.xtc' subsample(root+psf_file, root+xtc_file, root+out_file) diff --git a/tests/test_features.py b/tests/test_features.py index 94871d2..e254eac 100644 --- a/tests/test_features.py +++ b/tests/test_features.py @@ -1,102 +1,106 @@ -import pytest -import os -import importlib -from pensa.features import * +from pensa.features import \ + get_protein_backbone_torsions, \ + get_protein_sidechain_torsions, \ + get_nucleicacid_backbone_torsions, \ + get_nucleicacid_pseudotorsions # Location of the data used in the tests test_data_path = './tests/test_data' # Simulation A (MOR-apo) -root_dir_a = test_data_path+'/MOR-apo' -ref_file_a = root_dir_a+'/mor-apo.psf' -pdb_file_a = root_dir_a+'/mor-apo.pdb' -trj_file_a = [root_dir_a+'/mor-apo-1.xtc', - root_dir_a+'/mor-apo-2.xtc', - root_dir_a+'/mor-apo-3.xtc'] +root_dir_a = test_data_path + '/MOR-apo' +ref_file_a = root_dir_a + '/mor-apo.psf' +pdb_file_a = root_dir_a + '/mor-apo.pdb' +trj_file_a = [root_dir_a + '/mor-apo-1.xtc', + root_dir_a + '/mor-apo-2.xtc', + root_dir_a + '/mor-apo-3.xtc'] # Simulation B (MOR-BU72) -root_dir_b = test_data_path+'/MOR-BU72' -ref_file_b = root_dir_b+'/mor-bu72.psf' -pdb_file_b = root_dir_b+'/mor-bu72.pdb' -trj_file_b = [root_dir_b+'/mor-bu72-1.xtc', - root_dir_b+'/mor-bu72-2.xtc', - root_dir_b+'/mor-bu72-3.xtc'] +root_dir_b = test_data_path + '/MOR-BU72' +ref_file_b = root_dir_b + '/mor-bu72.psf' +pdb_file_b = root_dir_b + '/mor-bu72.pdb' +trj_file_b = [root_dir_b + '/mor-bu72-1.xtc', + root_dir_b + '/mor-bu72-2.xtc', + root_dir_b + '/mor-bu72-3.xtc'] # Simulation O (DNA-opt) -root_dir_o = test_data_path+'/DNA-opt' -gro_file_o = root_dir_o+'/DNA-nw.gro' -trj_file_o = root_dir_o+'/trajfit-nw_step100.xtc' +root_dir_o = test_data_path + '/DNA-opt' +gro_file_o = root_dir_o + '/DNA-nw.gro' +trj_file_o = root_dir_o + '/trajfit-nw_step100.xtc' # Simulation S (DNA-std) -root_dir_s = test_data_path+'/DNA-std' -gro_file_s = root_dir_s+'/DNA-nw.gro' -trj_file_s = root_dir_s+'/trajfit-nw_step100.xtc' +root_dir_s = test_data_path + '/DNA-std' +gro_file_s = root_dir_s + '/DNA-nw.gro' +trj_file_s = root_dir_s + '/trajfit-nw_step100.xtc' + # Test protein backbone torsions def test_get_protein_backbone_torsions(): # Feature Loaders bb_torsions_a = get_protein_backbone_torsions( pdb_file_a, trj_file_a[0], selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True, include_omega=False - ) + ) bb_torsions_b = get_protein_backbone_torsions( pdb_file_b, trj_file_b[0], selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True, include_omega=False - ) + ) assert len(bb_torsions_a) == len(bb_torsions_b) assert bb_torsions_a[0][0] == bb_torsions_b[0][0] == 'PHI 0 VAL 66' assert bb_torsions_a[0][-1] == bb_torsions_b[0][-1] == 'PSI 0 CYS 351' - assert '%1.4f %1.4f'%(bb_torsions_a[1][0][0], bb_torsions_a[1][0][-1]) == '-1.0394 -0.7756' - assert '%1.4f %1.4f'%(bb_torsions_a[1][-1][0], bb_torsions_a[1][-1][-1]) == '-1.0648 -0.5028' + assert '%1.4f %1.4f' % (bb_torsions_a[1][0][0], bb_torsions_a[1][0][-1]) == '-1.0394 -0.7756' + assert '%1.4f %1.4f' % (bb_torsions_a[1][-1][0], bb_torsions_a[1][-1][-1]) == '-1.0648 -0.5028' + # Test protein side-chain torsions def test_get_protein_sidechain_torsions(): # Feature Loaders sc_torsions_a = get_protein_sidechain_torsions( pdb_file_a, trj_file_a[0], selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True - ) + ) sc_torsions_b = get_protein_sidechain_torsions( pdb_file_b, trj_file_b[0], selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True - ) + ) assert len(sc_torsions_a) == len(sc_torsions_b) assert sc_torsions_a[0][0] == sc_torsions_b[0][0] == 'CHI1 0 MET 65' assert sc_torsions_a[0][-1] == sc_torsions_b[0][-1] == 'CHI5 0 ARG 348' - assert '%1.4f %1.4f'%(sc_torsions_a[1][0][0], sc_torsions_a[1][0][-1]) == '1.0151 -0.2732' - assert '%1.4f %1.4f'%(sc_torsions_a[1][-1][0], sc_torsions_a[1][-1][-1]) == '-0.7536 0.1536' + assert '%1.4f %1.4f' % (sc_torsions_a[1][0][0], sc_torsions_a[1][0][-1]) == '1.0151 -0.2732' + assert '%1.4f %1.4f' % (sc_torsions_a[1][-1][0], sc_torsions_a[1][-1][-1]) == '-0.7536 0.1536' + # Test nucleic acid backbone torsions def test_get_nucleicacid_backbone_torsions(): bb_torsions_o = get_nucleicacid_backbone_torsions( gro_file_o, trj_file_o, selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True - ) + ) bb_torsions_s = get_nucleicacid_backbone_torsions( gro_file_s, trj_file_s, selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True - ) + ) assert bb_torsions_o[0] == bb_torsions_s[0] + # Test nucleic acid pseudo-torsions def test_get_nucleicacid_pseudotorsions(): pseudotorsions_o = get_nucleicacid_pseudotorsions( gro_file_o, trj_file_o, selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True - ) + ) pseudotorsions_s = get_nucleicacid_pseudotorsions( gro_file_s, trj_file_s, selection='all', - first_frame=0, last_frame=None, step=1, + first_frame=0, last_frame=None, step=1, naming='segindex', radians=True - ) + ) pseudotorsions_o[0] == pseudotorsions_s[0] - diff --git a/tests/test_workflow.py b/tests/test_workflow.py index f66f999..38e4c67 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -1,112 +1,129 @@ -import unittest, gc, os +import unittest +import gc +import os import matplotlib.pyplot as plt import numpy as np -from pensa.clusters import * -from pensa.comparison import * -from pensa.features import * -from pensa.dimensionality import * -from pensa.preprocessing import * +from pensa.clusters import \ + obtain_clusters, wss_over_number_of_clusters, \ + obtain_combined_clusters, wss_over_number_of_combined_clusters, \ + write_cluster_traj +from pensa.comparison import \ + get_discrete_states, \ + relative_entropy_analysis, relen_block_analysis, relen_sem_analysis, \ + ssi_ensemble_analysis, ssi_block_analysis, ssi_sem_analysis, \ + residue_visualization, distances_visualization +from pensa.features import \ + get_structure_features, \ + get_multivar_res_timeseries, \ + sort_features +from pensa.dimensionality import \ + calculate_pca, pca_features, pca_eigenvalues_plot, \ + calculate_tica, tica_features, tica_eigenvalues_plot, \ + sort_traj_along_pc, sort_trajs_along_common_pc, \ + sort_traj_along_tic, sort_trajs_along_common_tic, \ + compare_projections +from pensa.preprocessing import \ + load_selection, extract_coordinates, extract_coordinates_combined # Location of the data used in the tests test_data_path = './tests/test_data' # Locations for the output files -for subdir in ['traj','plots','vispdb','pca','tica','clusters','results']: - if not os.path.exists(test_data_path+'/'+subdir): - os.makedirs(test_data_path+'/'+subdir) +for subdir in ['traj', 'plots', 'vispdb', 'pca', 'tica', 'clusters', 'results']: + if not os.path.exists(test_data_path + '/' + subdir): + os.makedirs(test_data_path + '/' + subdir) -### CLASS WITH ALL TEST FUNCTIONS ### +# ** CLASS WITH ALL TEST FUNCTIONS ** # class Test_pensa(unittest.TestCase): - # ** EXAMPLE WORKFLOW WITH REUSABLE DATA ** - + # EXAMPLE WORKFLOW WITH REUSABLE DATA + def setUp(self): # - PREPROCESSING - + # Set root directory for each simulation - root_dir_a = test_data_path+'/MOR-apo' - root_dir_b = test_data_path+'/MOR-BU72' + root_dir_a = test_data_path + '/MOR-apo' + root_dir_b = test_data_path + '/MOR-BU72' # Simulation A - self.ref_file_a = root_dir_a+'/mor-apo.psf' - self.pdb_file_a = root_dir_a+'/mor-apo.pdb' - self.trj_file_a = [root_dir_a+'/mor-apo-1.xtc', - root_dir_a+'/mor-apo-2.xtc', - root_dir_a+'/mor-apo-3.xtc'] + self.ref_file_a = root_dir_a + '/mor-apo.psf' + self.pdb_file_a = root_dir_a + '/mor-apo.pdb' + self.trj_file_a = [root_dir_a + '/mor-apo-1.xtc', + root_dir_a + '/mor-apo-2.xtc', + root_dir_a + '/mor-apo-3.xtc'] # Simulation B - self.ref_file_b = root_dir_b+'/mor-bu72.psf' - self.pdb_file_b = root_dir_b+'/mor-bu72.pdb' - self.trj_file_b = [root_dir_b+'/mor-bu72-1.xtc', - root_dir_b+'/mor-bu72-2.xtc', - root_dir_b+'/mor-bu72-3.xtc'] - + self.ref_file_b = root_dir_b + '/mor-bu72.psf' + self.pdb_file_b = root_dir_b + '/mor-bu72.pdb' + self.trj_file_b = [root_dir_b + '/mor-bu72-1.xtc', + root_dir_b + '/mor-bu72-2.xtc', + root_dir_b + '/mor-bu72-3.xtc'] + # Names of the output files self.trj_name_a = test_data_path + "/traj/condition-a" self.trj_name_b = test_data_path + "/traj/condition-b" # First test case: entire receptor. - + # Base for the selection string for each simulation sel_base_a = "(not name H*) and protein" sel_base_b = "(not name H*) and protein" - + # Extract the coordinates of the entire receptors self.file_extr_a = extract_coordinates( - self.ref_file_a, self.pdb_file_a, self.trj_file_a, - self.trj_name_a+"_receptor", sel_base_a - ) + self.ref_file_a, self.pdb_file_a, self.trj_file_a, + self.trj_name_a + "_receptor", sel_base_a + ) self.file_extr_b = extract_coordinates( - self.ref_file_b, self.pdb_file_b, self.trj_file_b, - self.trj_name_b+"_receptor", sel_base_b - ) + self.ref_file_b, self.pdb_file_b, self.trj_file_b, + self.trj_name_b + "_receptor", sel_base_b + ) # Second test case: only transmembrane (TM) region. - + # Generate selection strings from the file sel_string_a = load_selection( - test_data_path + "/mor_tm.txt", sel_base_a+" and " - ) + test_data_path + "/mor_tm.txt", sel_base_a + " and " + ) sel_string_b = load_selection( - test_data_path + "/mor_tm.txt", sel_base_b+" and " - ) - + test_data_path + "/mor_tm.txt", sel_base_b + " and " + ) + # Extract the coordinates of the TM region from the trajectory self.file_tm_single_a = extract_coordinates( - self.ref_file_a, self.pdb_file_a, [self.trj_file_a], - self.trj_name_a+"_tm", sel_string_a - ) + self.ref_file_a, self.pdb_file_a, [self.trj_file_a], + self.trj_name_a + "_tm", sel_string_a + ) self.file_tm_single_b = extract_coordinates( - self.ref_file_b, self.pdb_file_b, [self.trj_file_b], - self.trj_name_b+"_tm", sel_string_b - ) + self.ref_file_b, self.pdb_file_b, [self.trj_file_b], + self.trj_name_b + "_tm", sel_string_b + ) self.file_tm_combined = extract_coordinates_combined( - [self.ref_file_a]*3 + [self.ref_file_b]*3, + [self.ref_file_a] * 3 + [self.ref_file_b] * 3, self.trj_file_a + self.trj_file_b, - [sel_string_a]*3 + [sel_string_b]*3, + [sel_string_a] * 3 + [sel_string_b] * 3, test_data_path + '/traj/combined_tm', start_frame=0 - ) - + ) # - FEATURES - - + self.start_frame = 0 - + # -- Receptor features -- sim_a_rec = get_structure_features( test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", - start_frame = self.start_frame - ) + start_frame=self.start_frame + ) sim_b_rec = get_structure_features( test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-b_receptor.xtc", - start_frame = self.start_frame - ) + start_frame=self.start_frame + ) self.sim_a_rec_feat, self.sim_a_rec_data = sim_a_rec self.sim_b_rec_feat, self.sim_b_rec_data = sim_b_rec @@ -114,69 +131,66 @@ def setUp(self): sim_a_tmr = get_structure_features( test_data_path + "/traj/condition-a_tm.gro", test_data_path + "/traj/condition-a_tm.xtc", - start_frame = self.start_frame - ) + start_frame=self.start_frame + ) sim_b_tmr = get_structure_features( test_data_path + "/traj/condition-b_tm.gro", test_data_path + "/traj/condition-b_tm.xtc", - start_frame = self.start_frame - ) + start_frame=self.start_frame + ) self.sim_a_tmr_feat, self.sim_a_tmr_data = sim_a_tmr self.sim_b_tmr_feat, self.sim_b_tmr_data = sim_b_tmr - - + # -- Discrete States -- - + # --- Multivariate Features out_name_a = "condition-a" out_name_b = "condition-b" self.sim_a_rec_multivar_feat, self.sim_a_rec_multivar_data = get_multivar_res_timeseries( self.sim_a_rec_feat, self.sim_a_rec_data, 'sc-torsions', write=True, out_name=out_name_a - ) + ) self.sim_b_rec_multivar_feat, self.sim_b_rec_multivar_data = get_multivar_res_timeseries( self.sim_b_rec_feat, self.sim_b_rec_data, 'sc-torsions', write=True, out_name=out_name_b - ) - - # --- Gaussian Discretization + ) + + # --- Gaussian Discretization self.discrete_states_ab = get_discrete_states( self.sim_a_rec_multivar_data['sc-torsions'], self.sim_b_rec_multivar_data['sc-torsions'], discretize='gaussian', pbc=True - ) - + ) # -- Relative Entropy -- - + # --- BB Torsions self.relen_bbtor = relative_entropy_analysis( self.sim_a_rec_feat['bb-torsions'], self.sim_b_rec_feat['bb-torsions'], self.sim_a_rec_data['bb-torsions'], self.sim_b_rec_data['bb-torsions'], bin_num=10, verbose=False - ) + ) self.names_bbtors, self.jsd_bbtors, self.kld_ab_bbtors, kld_ba_bbtors = self.relen_bbtor - + # --- SC Torsions self.relen_sctor = relative_entropy_analysis( self.sim_a_rec_feat['sc-torsions'], self.sim_b_rec_feat['sc-torsions'], self.sim_a_rec_data['sc-torsions'], self.sim_b_rec_data['sc-torsions'], bin_num=10, verbose=False - ) + ) self.names_sctors, self.jsd_sctors, self.kld_ab_sctors, kld_ba_sctors = self.relen_sctor - + # --- Distance self.relen_dist = relative_entropy_analysis( self.sim_a_rec_feat['bb-distances'], self.sim_b_rec_feat['bb-distances'], self.sim_a_rec_data['bb-distances'], self.sim_b_rec_data['bb-distances'], bin_num=10, verbose=False - ) + ) self.names_bbdist, self.jsd_bbdist, self.kld_ab_bbdist, self.kld_ba_bbdist = self.relen_dist - # - PCA AND TICA - - + bbt_a = self.sim_a_tmr_data['bb-torsions'] bbt_b = self.sim_b_tmr_data['bb-torsions'] - combined_data_tors = np.concatenate([bbt_a, bbt_b],0) - + combined_data_tors = np.concatenate([bbt_a, bbt_b], 0) + self.pca_combined = calculate_pca(combined_data_tors) self.tica_bbt_a = calculate_tica(bbt_a) self.tica_bbt_b = calculate_tica(bbt_b) @@ -185,7 +199,7 @@ def setUp(self): self.graph, self.corr = pca_features( self.pca_combined, self.sim_a_tmr_feat['bb-torsions'], combined_data_tors, 3, 0.4, plot_file=test_data_path + "/plots/pca-features_bbtors_a.pdf" - ) + ) plt.close() # -- Sort trajectory pc @@ -195,9 +209,9 @@ def setUp(self): self.sim_a_tmr_data['bb-torsions'], test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", + test_data_path + "/pca/condition-a_receptor_by_tmr", pca=pca_a, num_pc=3 - ) + ) # -- Compare projections self.val = compare_projections( @@ -205,112 +219,95 @@ def setUp(self): self.sim_b_tmr_data['bb-torsions'], self.pca_combined, label_a='A', label_b='B' - ) + ) plt.close() - - + # - CLUSTERING - - + self.cidx, self.cond, self.oidx, self.wss, self.centroids = obtain_combined_clusters( - self.sim_a_tmr_data['bb-torsions'],self.sim_b_tmr_data['bb-torsions'], + self.sim_a_tmr_data['bb-torsions'], self.sim_b_tmr_data['bb-torsions'], label_a='A', label_b='B', start_frame=0, algorithm='kmeans', max_iter=100, num_clusters=3, min_dist=12, saveas=test_data_path + '/plots/combined_clust_bbtors.pdf' - ) + ) # -- Write clusters as trajectory name = "condition-a_tm" self.atom_group = write_cluster_traj( - self.cidx[self.cond==0], - test_data_path + "/traj/"+name+".gro",test_data_path + "/traj/"+name+".xtc", - test_data_path + "/clusters/"+"combined_clust_bbtors_"+name, - self.start_frame - ) - + self.cidx[self.cond == 0], + test_data_path + "/traj/" + name + ".gro", test_data_path + "/traj/" + name + ".xtc", + test_data_path + "/clusters/" + "combined_clust_bbtors_" + name, + self.start_frame + ) # ** COORDINATES AND FEATURES ** - # -- extract_coordinates() and extract_coordinates_combined() def test_01_extract_coordinates(self): - # Number of atoms from selection in first test case self.assertEqual(self.file_extr_a, 2322) self.assertEqual(self.file_extr_b, 2322) - # Number of Atom from the selction self.assertEqual(self.file_tm_single_a, 1877) self.assertEqual(self.file_tm_single_b, 1877) # Number of Atom from the selection - combined self.assertEqual(self.file_tm_combined, 1877) - # -- load_selection() def test_02_load_selection(self): - sel_base_a = "(not name H*) and protein" sel_base_b = "(not name H*) and protein" - - sel_string_a = load_selection(test_data_path + "/mor_tm.txt", sel_base_a+" and ") - sel_string_b = load_selection(test_data_path + "/mor_tm.txt", sel_base_b+" and ") - + sel_string_a = load_selection(test_data_path + "/mor_tm.txt", sel_base_a + " and ") + sel_string_b = load_selection(test_data_path + "/mor_tm.txt", sel_base_b + " and ") self.assertEqual(sel_string_a, '(not name H*) and protein and resid 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 ') self.assertEqual(sel_string_b, '(not name H*) and protein and resid 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 ') - # -- get_features() def test_03_get_feature(self): - + # Simulation A, full receptor self.assertEqual(len(self.sim_a_rec_feat['bb-torsions']), 574) self.assertEqual(len(self.sim_a_rec_feat['sc-torsions']), 527) self.assertEqual(self.sim_a_rec_data['bb-torsions'].shape, (30, 574)) self.assertEqual(self.sim_a_rec_data['sc-torsions'].shape, (30, 527)) - + # Simulation B, full receptor self.assertEqual(len(self.sim_b_rec_feat['bb-torsions']), 574) self.assertEqual(len(self.sim_b_rec_feat['sc-torsions']), 527) self.assertEqual(self.sim_b_rec_data['bb-torsions'].shape, (30, 574)) self.assertEqual(self.sim_b_rec_data['sc-torsions'].shape, (30, 527)) - + # Simulation A, transmembrane region self.assertEqual(len(self.sim_a_tmr_feat['bb-torsions']), 448) self.assertEqual(len(self.sim_a_tmr_feat['sc-torsions']), 423) self.assertEqual(self.sim_a_tmr_data['bb-torsions'].shape, (30, 448)) self.assertEqual(self.sim_a_tmr_data['sc-torsions'].shape, (30, 423)) - + # Simulation B, transmembrane region self.assertEqual(len(self.sim_b_tmr_feat['bb-torsions']), 448) self.assertEqual(len(self.sim_b_tmr_feat['sc-torsions']), 423) self.assertEqual(self.sim_b_tmr_data['bb-torsions'].shape, (30, 448)) self.assertEqual(self.sim_b_tmr_data['sc-torsions'].shape, (30, 423)) - # ** ENSEMBLE COMPARISON ** - - + # -- relative_entropy_analysis() def test_04_relative_entropy_analysis(self): - self.assertEqual(len(self.relen_bbtor[1]), 574) self.assertEqual(len(self.relen_bbtor[0]), 574) - - + # -- relen_sem_analysis() def test_05_relen_sem_analysis(self): - # --- Uncertainty for torsions self.relen_bbtor_blocks = relen_block_analysis( self.sim_a_rec_feat['bb-torsions'], self.sim_b_rec_feat['bb-torsions'], self.sim_a_rec_data['bb-torsions'], self.sim_b_rec_data['bb-torsions'], blockanlen=10, cumdist=False, verbose=True - ) + ) self.relen_bbtor_sem = relen_sem_analysis(self.relen_bbtor_blocks, write_plot=False) - # --- Uncertainty for distances self.relen_dist_blocks = relen_block_analysis( self.sim_a_rec_feat['bb-distances'], self.sim_b_rec_feat['bb-distances'], self.sim_a_rec_data['bb-distances'], self.sim_b_rec_data['bb-distances'], blockanlen=10, cumdist=False, verbose=True - ) + ) self.relen_dist_sem = relen_sem_analysis(self.relen_dist_blocks, write_plot=False) - print('SEM FOR TORSIONS') resrelenvals, avresrelenvals, avsemvals = self.relen_bbtor_sem print('resrelenvals:') @@ -319,7 +316,6 @@ def test_05_relen_sem_analysis(self): print(avresrelenvals) print('avsemvals:') print(avsemvals) - resrelenvals, avresrelenvals, avsemvals = self.relen_dist_sem print('SEM FOR DISTANCES') print('resrelenvals:') @@ -329,64 +325,56 @@ def test_05_relen_sem_analysis(self): print('avsemvals:') print(avsemvals) - # -- ssi_ensemble_analysis() def test_06_ssi_ensemble_analysis(self): - # --- Torsions ssi_sctor = ssi_ensemble_analysis( self.sim_a_rec_multivar_feat['sc-torsions'], self.sim_b_rec_multivar_feat['sc-torsions'], self.sim_a_rec_multivar_data['sc-torsions'], self.sim_b_rec_multivar_data['sc-torsions'], self.discrete_states_ab, verbose=False - ) + ) names_sctors, ssi_sctors = ssi_sctor - # -- ssi_sem_analysis() def test_07_ssi_sem_analysis(self): - # --- Uncertainty for torsions ssi_names, ssi_sctor_blocks = ssi_block_analysis( self.sim_a_rec_feat['sc-torsions'], self.sim_b_rec_feat['sc-torsions'], self.sim_a_rec_data['sc-torsions'], self.sim_b_rec_data['sc-torsions'], blockanlen=100, pbc=True, discretize='gaussian', group_feat=True, cumdist=False, verbose=True - ) + ) ssi_sctor_sem = ssi_sem_analysis( ssi_names, ssi_sctor_blocks, write_plot=False - ) - + ) # -- sort_features() def test_08_sort_features(self): sf = sort_features(self.names_bbtors, self.jsd_bbtors) self.assertEqual(len(sf), 574) - # -- residue_visualization() def test_09_residue_visualization(self): ref_filename = test_data_path + "/traj/condition-a_receptor.gro" out_filename = "receptor_bbtors-deviations_tremd" vis = residue_visualization( self.names_bbtors, self.jsd_bbtors, ref_filename, - test_data_path + "/plots/"+out_filename+"_jsd.pdf", - test_data_path + "/vispdb/"+out_filename+"_jsd.pdb", + test_data_path + "/plots/" + out_filename + "_jsd.pdf", + test_data_path + "/vispdb/" + out_filename + "_jsd.pdb", y_label='max. JS dist. of BB torsions' - ) - + ) self.assertEqual(len(vis), 2) self.assertEqual(len(vis[0]), 288) self.assertEqual(len(vis[1]), 288) plt.close() del vis - # -- distances_visualization() def test_10_distances_visualization(self): matrix = distances_visualization( self.names_bbdist, self.jsd_bbdist, test_data_path + "/plots/receptor_jsd-bbdist.pdf", - vmin = 0.0, vmax = 1.0 - ) + vmin=0.0, vmax=1.0 + ) self.assertEqual(len(matrix), 288) for i in range(len(matrix)): self.assertEqual(len(matrix[i]), 288) @@ -394,46 +382,39 @@ def test_10_distances_visualization(self): plt.close() del matrix - - # ** DIMENSIONALITY ** - # -- calculate_pca() def test_11_calculate_pca(self): self.assertEqual(len(self.pca_combined.mean_), 448) self.assertEqual(self.pca_combined.get_covariance().shape[0], 448) - # -- calculate_tica def test_12_calculate_tica(self): self.assertEqual(self.tica_bbt_a.koopman_matrix.size, 841) self.assertEqual(self.tica_bbt_b.koopman_matrix.size, 841) - # -- pca_eigenvalues_plot() def test_13_pca_eigenvalues_plot(self): arr = pca_eigenvalues_plot( - self.pca_combined, num=12, - plot_file=test_data_path+'/plots/combined_tmr_pca_ev.pdf' - ) + self.pca_combined, num=12, + plot_file=test_data_path + '/plots/combined_tmr_pca_ev.pdf' + ) self.assertEqual(len(arr[0]), 12) self.assertEqual(len(arr[1]), 12) plt.close() del arr - # -- tica_eigenvalues_plot() def test_14_tica_eigenvalues_plot(self): arr_1, arr_2 = tica_eigenvalues_plot( - self.tica_bbt_a, num=12, - plot_file=test_data_path+'/plots/combined_tmr_tica_bbt_a_ev.pdf' - ) + self.tica_bbt_a, num=12, + plot_file=test_data_path + '/plots/combined_tmr_tica_bbt_a_ev.pdf' + ) self.assertEqual(len(arr_1), 12) self.assertEqual(len(arr_2), 12) - - #-- pca_features() + # -- pca_features() def test_15_pca_features(self): self.assertEqual(len(self.graph), 3) plt.close() @@ -443,27 +424,25 @@ def test_15_pca_features(self): # -- Corr self.assertEqual(len(self.corr), 48) - # -- tica_features() def test_16_tica_features(self): test_feature = tica_features( self.tica_bbt_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4 - ) + ) self.assertEqual(len(test_feature), 448) - # -- sort_trajs_along_common_pc() + sort_traj_along_pc() + project_on_pc() def test_17_sort_trajs_along_pc(self): sort_common_traj = sort_trajs_along_common_pc( - self.sim_a_tmr_data['bb-torsions'], + self.sim_a_tmr_data['bb-torsions'], self.sim_b_tmr_data['bb-torsions'], test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-b_receptor.gro", + test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/pca/receptor_by_tmr", - num_pc=3, start_frame = self.start_frame - ) + num_pc=3, start_frame=self.start_frame + ) plt.close() for ele in sort_common_traj: self.assertEqual(len(ele), 3) @@ -476,36 +455,32 @@ def test_18_sort_trajs_along_common_tic(self): self.sim_b_tmr_data['bb-torsions'], test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-b_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/tica/receptor_by_tmr", num_ic=3 - ) + ) self.assertEqual(len(sproj[0]), 60) self.assertEqual(len(sidx_data[0]), 60) - # -- sort_traj_along_tic() def test_19_sort_traj_along_tic(self): all_sort, _, _ = sort_traj_along_tic( - self.sim_a_tmr_data['bb-torsions'], + self.sim_a_tmr_data['bb-torsions'], test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", - tica = self.tica_bbt_a, + test_data_path + "/pca/condition-a_receptor_by_tmr", + tica=self.tica_bbt_a, num_ic=3 - ) + ) self.assertEqual(len(all_sort), 3) - # -- compare_projections() def test_20_compare_projections(self): - self.assertEqual(len(self.val), 3) self.assertEqual(len(self.val[0]), 2) self.assertEqual(len(self.val[1]), 2) self.assertEqual(len(self.val[2]), 2) - self.assertEqual(len(self.val[0][0]), 30) self.assertEqual(len(self.val[0][1]), 30) self.assertEqual(len(self.val[1][0]), 30) @@ -513,22 +488,18 @@ def test_20_compare_projections(self): self.assertEqual(len(self.val[2][0]), 30) self.assertEqual(len(self.val[2][1]), 30) - - - # ** CLUSTERING ** - # -- obtain_combined_clusters() def test_21_obtain_combined_clusters(self): self.assertEqual(len(self.cidx), 60) self.assertEqual(len(self.cond), 60) - test_oidx = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, 2, 3, - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + test_oidx = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 - ] + ] self.assertEqual(len(self.centroids), 3) self.assertEqual(len(self.centroids[0]), 448) self.assertEqual(len(self.centroids[1]), 448) @@ -536,34 +507,30 @@ def test_21_obtain_combined_clusters(self): for i in range(len(self.oidx)): self.assertEqual(self.oidx[i], test_oidx[i]) - # -- write_cluster_traj() def test_22_write_cluster_traj(self): self.assertEqual(len(self.atom_group), 30) for i in range(len(self.atom_group)): self.assertEqual(self.atom_group[i].n_atoms, 1877) - # -- wss_over_number_of_combined_clusters() def test_23_wss_over_number_of_combined_clusters(self): - # -- wss over number of combined wss_combined_avg, wss_combined_std = wss_over_number_of_combined_clusters( self.sim_a_tmr_data['bb-torsions'], self.sim_b_tmr_data['bb-torsions'], label_a='A', label_b='B', start_frame=0, - algorithm='kmeans', max_num_clusters = 12, - max_iter=100, num_repeats = 5, - plot_file = None - ) + algorithm='kmeans', max_num_clusters=12, + max_iter=100, num_repeats=5, + plot_file=None + ) plt.close() - # -- wss over number of clusters wss_avg, wss_std = wss_over_number_of_clusters( self.sim_a_tmr_data['bb-torsions'], - algorithm='kmeans', max_num_clusters = 12, - max_iter=100, num_repeats = 5, - plot_file = None - ) + algorithm='kmeans', max_num_clusters=12, + max_iter=100, num_repeats=5, + plot_file=None + ) plt.close() self.assertEqual(len(wss_avg), 11) self.assertEqual(len(wss_std), 11) @@ -572,11 +539,10 @@ def test_23_wss_over_number_of_combined_clusters(self): self.assertEqual(len(wss_combined_std), 11) self.assertLess(wss_combined_std[0], 1.0e-12) - # -- obtain_clusters() def test_24_obtain_clusters(self): # -- obtain combined clusters - _ci, _wss, _centroids = obtain_clusters(self.sim_a_tmr_data['bb-torsions'], num_clusters=5 ) + _ci, _wss, _centroids = obtain_clusters(self.sim_a_tmr_data['bb-torsions'], num_clusters=5) plt.close() unq_ci = list(set(_ci)) for i in range(len(unq_ci)): @@ -587,10 +553,6 @@ def test_24_obtain_clusters(self): self.assertEqual(len(_centroids[i]), 448) - -### RUN THE TESTS ### - +# ** RUN THE TESTS ** # unittest.main(argv=['ignored', '-v'], exit=False) - gc.collect() - diff --git a/tutorial/PENSA_Tutorial_SSI.py b/tutorial/PENSA_Tutorial_SSI.py index 60b4e21..b1eae5b 100755 --- a/tutorial/PENSA_Tutorial_SSI.py +++ b/tutorial/PENSA_Tutorial_SSI.py @@ -13,14 +13,14 @@ # # # # Define where to save the GPCRmd files root_dir = './mor-data' # # Define which files to download -md_files = ['11427_dyn_151.psf','11426_dyn_151.pdb', # MOR-apo - '11423_trj_151.xtc','11424_trj_151.xtc','11425_trj_151.xtc', - '11580_dyn_169.psf','11579_dyn_169.pdb', # MOR-BU72 - '11576_trj_169.xtc','11577_trj_169.xtc','11578_trj_169.xtc'] +md_files = ['11427_dyn_151.psf', '11426_dyn_151.pdb', # MOR-apo + '11423_trj_151.xtc', '11424_trj_151.xtc', '11425_trj_151.xtc', + '11580_dyn_169.psf', '11579_dyn_169.pdb', # MOR-BU72 + '11576_trj_169.xtc', '11577_trj_169.xtc', '11578_trj_169.xtc'] # # Download all the files that do not exist yet for file in md_files: - if not os.path.exists(os.path.join(root_dir,file)): - download_from_gpcrmd(file,root_dir) + if not os.path.exists(os.path.join(root_dir, file)): + download_from_gpcrmd(file, root_dir) root_dir = './mor-data' # # # Simulation A @@ -42,23 +42,23 @@ out_name_a = "traj/condition-a" out_name_b = "traj/condition-b" -for subdir in ['traj','plots','vispdb','pca','clusters','results']: +for subdir in ['traj', 'plots', 'vispdb', 'pca', 'clusters', 'results']: if not os.path.exists(subdir): os.makedirs(subdir) # Extract the coordinates of the receptor from the trajectory extract_coordinates(ref_file_a, pdb_file_a, trj_file_a, out_name_a+"_receptor", sel_base_a) extract_coordinates(ref_file_b, pdb_file_b, trj_file_b, out_name_b+"_receptor", sel_base_b) - + # # # Extract the features from the beginning (start_frame) of the trajectory -start_frame=0 +start_frame=0 a_rec = get_structure_features(out_name_a+"_receptor.gro", - out_name_a+"_receptor.xtc", + out_name_a+"_receptor.xtc", start_frame) a_rec_feat, a_rec_data = a_rec b_rec = get_structure_features(out_name_b+"_receptor.gro", - out_name_b+"_receptor.xtc", + out_name_b+"_receptor.xtc", start_frame) b_rec_feat, b_rec_data = b_rec @@ -67,20 +67,20 @@ out_name_b = "condition-b" -# # Extract the multivariate torsion coordinates of each residue as a -# # timeseries from the trajectory and write into subdirectory -# # output = [[torsion 1 timeseries],[torsion 2 timeseries],...,[torsion n timeseries]] -sc_multivar_res_feat_a, sc_multivar_res_data_a = get_multivar_res_timeseries(a_rec_feat,a_rec_data,'sc-torsions',write=True,out_name=out_name_a) -sc_multivar_res_feat_b, sc_multivar_res_data_b = get_multivar_res_timeseries(b_rec_feat,b_rec_data,'sc-torsions',write=True,out_name=out_name_b) +# # Extract the multivariate torsion coordinates of each residue as a +# # timeseries from the trajectory and write into subdirectory +# # output = [[torsion 1 timeseries], [torsion 2 timeseries], ..., [torsion n timeseries]] +sc_multivar_res_feat_a, sc_multivar_res_data_a = get_multivar_res_timeseries(a_rec_feat, a_rec_data, 'sc-torsions', write=True, out_name=out_name_a) +sc_multivar_res_feat_b, sc_multivar_res_data_b = get_multivar_res_timeseries(b_rec_feat, b_rec_data, 'sc-torsions', write=True, out_name=out_name_b) discrete_states_ab = get_discrete_states(sc_multivar_res_data_a['sc-torsions'], sc_multivar_res_data_b['sc-torsions'], discretize='gaussian', pbc=True) -# # # We can calculate the State Specific Information (SSI) shared between the -# # # ensemble switch and the combined ensemble residue conformations. As the ensemble -# # # is a binary change, SSI can exist within the range [0,1] units=bits. -# # # 0 bits = no information, 1 bits = maximum information, i.e. you can predict the state of the ensemble with +# # # We can calculate the State Specific Information (SSI) shared between the +# # # ensemble switch and the combined ensemble residue conformations. As the ensemble +# # # is a binary change, SSI can exist within the range [0, 1] units=bits. +# # # 0 bits = no information, 1 bits = maximum information, i.e. you can predict the state of the ensemble with # # # certainty from the state of the residue. # # # Set write_plots = True to generate a folder with all the clustered states for each residue. data_names, data_ssi = ssi_ensemble_analysis(sc_multivar_res_feat_a['sc-torsions'], sc_multivar_res_feat_b['sc-torsions'], @@ -88,11 +88,11 @@ discrete_states_ab, verbose=True, write_plots=False) -# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -# # # # Featurizing waters and calculating SSI +# # # # Featurizing waters and calculating SSI -# # # # First we preprocess the trajectories to extract coordinates for protein +# # # # First we preprocess the trajectories to extract coordinates for protein # # # # and waters. root_dir = './mor-data' # Simulation A @@ -114,7 +114,7 @@ out_name_a = "traj/cond-a_water" out_name_b = "traj/cond-b_water" -for subdir in ['traj','plots','vispdb','pca','clusters','results']: +for subdir in ['traj', 'plots', 'vispdb', 'pca', 'clusters', 'results']: if not os.path.exists(subdir): os.makedirs(subdir) @@ -122,12 +122,12 @@ extract_coordinates(ref_file_a, pdb_file_a, trj_file_a, out_name_a, sel_base_a) extract_coordinates(ref_file_b, pdb_file_b, trj_file_b, out_name_b, sel_base_b) -# # # Extract the coordinates of the ensemble a aligned to ensemble b -extract_aligned_coords(out_name_a+".gro", out_name_a+".xtc", +# # # Extract the coordinates of the ensemble a aligned to ensemble b +extract_aligned_coords(out_name_a+".gro", out_name_a+".xtc", out_name_b+".gro", out_name_b+".xtc") -# # Extract the combined density of the waters in both ensembles a and b -extract_combined_grid(out_name_a+".gro", "dens/cond-a_wateraligned.xtc", +# # Extract the combined density of the waters in both ensembles a and b +extract_combined_grid(out_name_a+".gro", "dens/cond-a_wateraligned.xtc", out_name_b+".gro", out_name_b+".xtc", atomgroup="OH2", write_grid_as="TIP3P", @@ -137,8 +137,8 @@ grid_combined = "dens/ab_grid_OH2_density.dx" # # Then we featurize the waters common to both simulations -# # We can do the same analysis for ions using the get_atom_features featurizer. -water_feat_a, water_data_a = get_water_features(structure_input = out_name_a+".gro", +# # We can do the same analysis for ions using the get_atom_features featurizer. +water_feat_a, water_data_a = get_water_features(structure_input = out_name_a+".gro", xtc_input = "dens/cond-a_wateraligned.xtc", top_waters = 2, atomgroup = "OH2", @@ -146,7 +146,7 @@ write = True, out_name = "cond_a") -water_feat_b, water_data_b = get_water_features(structure_input = out_name_b+".gro", +water_feat_b, water_data_b = get_water_features(structure_input = out_name_b+".gro", xtc_input = out_name_b+".xtc", top_waters = 2, atomgroup = "OH2", @@ -162,8 +162,8 @@ discretize='gaussian', pbc=True) # # # SSI shared between waters and the switch between ensemble conditions -data_names, data_ssi = ssi_ensemble_analysis(water_feat_a['WaterPocket_Distr'],water_feat_b['WaterPocket_Distr'], - water_data_a['WaterPocket_Distr'],water_data_b['WaterPocket_Distr'], +data_names, data_ssi = ssi_ensemble_analysis(water_feat_a['WaterPocket_Distr'], water_feat_b['WaterPocket_Distr'], + water_data_a['WaterPocket_Distr'], water_data_b['WaterPocket_Distr'], discrete_states_ab1, verbose=True) @@ -175,30 +175,30 @@ discretize='partition_values', pbc=False) -data_names, data_ssi = ssi_ensemble_analysis(water_feat_a['WaterPocket_OccupDistr'],water_feat_b['WaterPocket_OccupDistr'], - water_data_a['WaterPocket_OccupDistr'],water_data_b['WaterPocket_OccupDistr'], +data_names, data_ssi = ssi_ensemble_analysis(water_feat_a['WaterPocket_OccupDistr'], water_feat_b['WaterPocket_OccupDistr'], + water_data_a['WaterPocket_OccupDistr'], water_data_b['WaterPocket_OccupDistr'], discrete_states_ab2, pbc=False, verbose=True) # # # In this example we can see that the state of water 01 shares ~0.25 bits of -# # # information with the ensembles, but the occupancy of water 1 pocket shares ~0.07 bits, +# # # information with the ensembles, but the occupancy of water 1 pocket shares ~0.07 bits, # # # revealing that the polarisation of this water is more functionally important with respect # # # to what these ensembles are investigating. -# # # We can calculate the State Specific Information (SSI) shared between the +# # # We can calculate the State Specific Information (SSI) shared between the # # # water pockets across both ensembles. This quantity is no longer bound by 1 bit. -data_names, data_ssi = ssi_feature_analysis(water_feat_a['WaterPocket_Distr'],water_feat_b['WaterPocket_Distr'], - water_data_a['WaterPocket_Distr'],water_data_b['WaterPocket_Distr'], +data_names, data_ssi = ssi_feature_analysis(water_feat_a['WaterPocket_Distr'], water_feat_b['WaterPocket_Distr'], + water_data_a['WaterPocket_Distr'], water_data_b['WaterPocket_Distr'], discrete_states_ab1, verbose=True) # # # The ssi_feature_analysis() tells us that these two water pockets are conformationally to some extent. # # # An equivalent interpretation of co-SSI is how much the switch between ensembles -# # # is involved in strengthening (positive co-SSI) / weakening (negative co-SSI) the conformational coupling between two features. -feat_names, data_ssi, data_cossi = cossi_featens_analysis(water_feat_a['WaterPocket_Distr'],water_feat_b['WaterPocket_Distr'], - water_feat_a['WaterPocket_Distr'],water_feat_b['WaterPocket_Distr'], - water_data_a['WaterPocket_Distr'],water_data_b['WaterPocket_Distr'], - water_data_a['WaterPocket_Distr'],water_data_b['WaterPocket_Distr'], +# # # is involved in strengthening (positive co-SSI) / weakening (negative co-SSI) the conformational coupling between two features. +feat_names, data_ssi, data_cossi = cossi_featens_analysis(water_feat_a['WaterPocket_Distr'], water_feat_b['WaterPocket_Distr'], + water_feat_a['WaterPocket_Distr'], water_feat_b['WaterPocket_Distr'], + water_data_a['WaterPocket_Distr'], water_data_b['WaterPocket_Distr'], + water_data_a['WaterPocket_Distr'], water_data_b['WaterPocket_Distr'], discrete_states_ab1, discrete_states_ab1, verbose=True) diff --git a/tutorial/PENSA_Tutorial_density_featurizer.py b/tutorial/PENSA_Tutorial_density_featurizer.py index 4533ce0..957c0ca 100755 --- a/tutorial/PENSA_Tutorial_density_featurizer.py +++ b/tutorial/PENSA_Tutorial_density_featurizer.py @@ -16,18 +16,18 @@ """ # # # Get the water pocket data for the top 3 most probable sites (top_waters = 3). -# # # Orientation of the waters (spherical coordinates [radians]) is a timeseries distribution. +# # # Orientation of the waters (spherical coordinates [radians]) is a timeseries distribution. # # # When water is not present at the site, the orientation is recorded as 10000.0 to represent an empty state. # # # Visualise the pocket occupancies on the reference structure in a pdb file (write=True) with occupation frequencies # # # saved as b_factors. If write=True, you must specify the water model for writing out the grid. # # # options include: -# SPC +# SPC # TIP3P -# TIP4P -# water +# TIP4P +# water struc = "mor-data/11426_dyn_151.pdb" xtc = "mor-data/11423_trj_151.xtc" -water_feat, water_data = get_water_features(structure_input = struc, +water_feat, water_data = get_water_features(structure_input = struc, xtc_input = xtc, top_waters = 1, atomgroup = "OH2", @@ -41,7 +41,7 @@ xtc = "mor-data/11423_trj_151.xtc" # # # Here we locate the sodium site which has the highest probability # # # The density grid is written (write=True) using the default density conversion "Angstrom^{-3}" in MDAnalysis -atom_feat, atom_data = get_atom_features(structure_input = struc, +atom_feat, atom_data = get_atom_features(structure_input = struc, xtc_input = xtc, top_atoms = 1, atomgroup = "SOD", @@ -54,7 +54,7 @@ struc = "mor-data/11426_dyn_151.pdb" xtc = "mor-data/11423_trj_151.xtc" grid = "dens/11426_dyn_151OH2_density.dx" -water_feat, water_data = get_water_features(structure_input = struc, +water_feat, water_data = get_water_features(structure_input = struc, xtc_input = xtc, top_waters = 5, atomgroup = "OH2", diff --git a/tutorial/PENSA_Tutorial_hbond_featurizer.py b/tutorial/PENSA_Tutorial_hbond_featurizer.py index 02a3681..be0ce82 100644 --- a/tutorial/PENSA_Tutorial_hbond_featurizer.py +++ b/tutorial/PENSA_Tutorial_hbond_featurizer.py @@ -6,13 +6,12 @@ @author: neil """ -import os from pensa import * -# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -# # # # Featurizing water cavity hydrogen bonds +# # # # Featurizing water cavity hydrogen bonds root_dir = './mor-data' @@ -21,22 +20,22 @@ trj_file_a = root_dir+'/11423_trj_151.xtc' names, data = get_cavity_bonds(ref_file_a, trj_file_a, - atomgroups = ['OH2', 'H1', 'H2'], - site_IDs = [1,2], + atomgroups = ['OH2', 'H1', 'H2'], + site_IDs = [1, 2], grid_input=None, write=True, write_grid_as='TIP3P', out_name='11423_trj_169') -# # # # Featurizing ligand-protein hydrogen bonds +# # # # Featurizing ligand-protein hydrogen bonds ref_file_a = root_dir+'/11580_dyn_169.psf' pdb_file_a = root_dir+'/11579_dyn_169.pdb' trj_file_a = root_dir+'/11578_trj_169.xtc' names, data = get_h_bonds(ref_file_a, trj_file_a, fixed_group = 'resname 4VO', - dyn_group='protein', + dyn_group='protein', write=True, out_name='4VO_hbonds')