Merge branch 'main' into mglob_prior

carlocamilloni · Dec 3, 2023 · 8679153 · 8679153
2 parents e4985bd + cd328a5
commit 8679153
Show file tree

Hide file tree

Showing 19 changed files with 3,578 additions and 3,669 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -3,20 +3,20 @@ name: 'Multi-eGO test'
 on: [ push, pull_request ]
 
 jobs:
-  build-linux:
+  build-linux-conda:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - uses: actions/cache@v3
       with:
         path: ~/.ccache
         key: ccache-reset-${{ github.sha }}
         restore-keys: ccache-reset-
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v4
       with:
-        python-version: '3.8'
+        python-version: '3.10'
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
@@ -25,14 +25,42 @@ jobs:
       run: |
         conda env update --file conda/environment.yml --name base
         conda install flake8
+    - name: Run flake8 
+      run: |
         # stop the build if there are Python syntax errors or undefined names
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    # - name: Install pytest
-      # run: |
-        # conda install pytest
     - name: Run tests
-      # run: .venv/bin/pytest
       run: |
         $CONDA/bin/python test/run_tests.py
+
+  build-linux-pip:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.10'
+        cache: 'pip'
+    - name: Install requirements
+      run: pip install -r requirements.txt
+    - name: Run tests
+      run: |
+        python test/run_tests.py
+
+  build-macos-pip:
+    runs-on: macos-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.9'
+        cache: 'pip'
+    - name: Install requirements
+      run: pip install -r requirements.txt
+    - name: Run tests
+      run: |
+        python test/run_tests.py
diff --git a/README.md b/README.md
@@ -21,7 +21,8 @@ Original version by Emanuele Scalone, Cristina Paissoni, and Carlo Camilloni, [C
 
 ## Installation
 Use ```conda``` and the enviroment file provided. For mac users employing an M2 CPU, we recommend using ```environment_macOS_M2.yml```.
-For all other hardware, we recommend using ```environment.yml```.
+For all other hardware, we recommend using ```environment.yml```. It is also possible to use ``pip install -r requirements.txt`` using
+Python 3.9 or later.
 
 ## Requirements
 Multi-*e*GO force-fields and tools are meant to be used with [GROMACS](https://www.gromacs.org), currently tested versions are 2021 to 2023.

diff --git a/conda/environment_macOS_M2.yml b/conda/environment_macOS_M2.yml
@@ -40,4 +40,3 @@ dependencies:
   - wheel=0.40.0=pyhd8ed1ab_0
   - xz=5.2.6=h57fd34a_0
   - zlib=1.2.13=h53f4e23_5
-prefix: /Users/cape/miniconda3/envs/multi-ego
diff --git a/mdps/ff_aa-posre.mdp b/mdps/ff_aa-posre.mdp
@@ -107,7 +107,7 @@ print-nose-hoover-chain-variables = no
 ; Groups to couple separately
 tc-grps                  = system
 ; Time constant (ps) and reference temperature (K)
-tau_t                    = 1.0 ; 50.0
+tau_t                    = 25; can be modified to improve diffusion 
 ref_t                    = _TEMP_ 
 ; pressure coupling     
 Pcoupl                   = no

diff --git a/mdps/ff_aa.mdp b/mdps/ff_aa.mdp
@@ -107,7 +107,7 @@ print-nose-hoover-chain-variables = no
 ; Groups to couple separately
 tc-grps                  = system
 ; Time constant (ps) and reference temperature (K)
-tau_t                    = 1.0 ; 50.0
+tau_t                    = 25; can be modified to improve diffusion 
 ref_t                    = _TEMP_ 
 ; pressure coupling     
 Pcoupl                   = no

diff --git a/mdps/ff_rc.mdp b/mdps/ff_rc.mdp
@@ -107,7 +107,7 @@ print-nose-hoover-chain-variables = no
 ; Groups to couple separately
 tc-grps                  = system
 ; Time constant (ps) and reference temperature (K)
-tau_t                    = 1.0 ; 50.0
+tau_t                    = 25; can be modified to improve diffusion 
 ref_t                    = _TEMP_ 
 ; pressure coupling     
 Pcoupl                   = no

diff --git a/multiego.py b/multiego.py
@@ -10,7 +10,7 @@
     parser = argparse.ArgumentParser(description='TODO!')
     parser.add_argument('--system', type=str, required=True, help='Name of the system corresponding to system input folder.')
     parser.add_argument('--egos', choices=['rc', 'production'], required=True, help='Type of EGO.\n rc: creates a force-field for random coil simulations.\n production: creates a force-field combining random coil simulations and training simulations.')
-    parser.add_argument('--epsilon', type=float, choices=[float_range.FloatRange(0.0, 1.0)], help='Maximum interaction energy per contact.')
+    parser.add_argument('--epsilon', type=float, choices=[float_range.FloatRange(0.0, 1000.0)], help='Maximum interaction energy per contact.')
     parser.add_argument('--no_header', action='store_true', help='Removes headers from output_files when set')
     # This is to use epsilon as default for inter molecular epsilon and ligand epsilon
     args, remaining = parser.parse_known_args()
@@ -20,9 +20,11 @@
     parser.add_argument('--check_with', nargs='+', type=str, default=[], help='Those are contacts from a simulation or a structure used to check whether the contacts learned are compatible with the structures provided in here')
 
     parser.add_argument('--p_to_learn', type=float, default=0.9995, help='Amount of the simulation to learn.')
-    parser.add_argument('--fraction', type=float, default=0.2, help='Minimum fraction of the maximum interaction energy per contact.')
+    parser.add_argument('--epsilon_min', type=float, default=0.07, help='The minimum meaningfull epsilon value.')
 
     parser.add_argument('--inter_epsilon', type=float, default=args.epsilon, help='Maximum interaction energy per intermolecular contacts.')
+    parser.add_argument('--inter_domain_epsilon', type=float, default=args.epsilon, help='Maximum interaction energy per interdomain contacts.')
+    parser.add_argument('--out', type=str, default='', help='Suffix for the output directory name.')
     args = parser.parse_args()
 
     args.root_dir = os.path.dirname(os.path.abspath(__file__))
@@ -43,8 +45,16 @@
     if args.p_to_learn<0.9:
         print('WARNING: --p_to_learn should be high enough')
 
-    if args.fraction<0.1 or args.fraction>0.3:
-        print('--fraction should be between 0.1 and 0.3')
+    if args.egos != 'rc' and args.epsilon <= args.epsilon_min:
+        print('--epsilon must be greater than --epsilon_min')
+        sys.exit()
+
+    if args.egos != 'rc' and args.inter_domain_epsilon <= args.epsilon_min:
+        print('--inter_domain_epsilon must be greater than --epsilon_min')
+        sys.exit()
+
+    if args.egos != 'rc' and args.inter_epsilon <= args.epsilon_min:
+        print('--inter_epsilon must be greater than --epsilon_min')
         sys.exit()
 
     if not os.path.exists(f'{args.root_dir}/outputs'): os.mkdir(f'{args.root_dir}/outputs')
@@ -71,4 +81,4 @@
 
     meGO_LJ_14 = ensemble.make_pairs_exclusion_topology(meGO_ensemble, meGO_LJ_14)
 
-    io.write_model(meGO_ensemble, meGO_LJ, meGO_LJ_14, args, output_dir)
+    io.write_model(meGO_ensemble, meGO_LJ, meGO_LJ_14, args, output_dir, args.out)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+ParmEd==3.4.3
+numpy==1.26.2
+pandas==2.1.3
diff --git a/src/multiego/ensemble.py b/src/multiego/ensemble.py
@@ -148,7 +148,7 @@ def initialize_molecular_contacts(contact_matrix, path, ensemble_molecules_idx_s
     contact_matrix = contact_matrix[~contact_matrix['ai'].astype(str).str.startswith('H')]
     contact_matrix = contact_matrix[~contact_matrix['aj'].astype(str).str.startswith('H')]
 
-    contact_matrix = contact_matrix[['molecule_name_ai', 'ai', 'molecule_name_aj', 'aj', 'distance', 'probability', 'cutoff']]
+    contact_matrix = contact_matrix[['molecule_name_ai', 'ai', 'molecule_name_aj', 'aj', 'distance', 'probability', 'cutoff', 'intra_domain']]
     if name[0] == 'intramat': contact_matrix['same_chain'] = True
     elif name[0] == 'intermat': contact_matrix['same_chain'] = False
     else: raise Exception('There might be an error in the contact matrix naming. It must be intermat_X_X or intramat_X_X')
@@ -168,17 +168,20 @@ def initialize_molecular_contacts(contact_matrix, path, ensemble_molecules_idx_s
             md_threshold = 1
             rc_threshold = 1
         else:
+            #find md threshold 
             p_sort_normalized = np.cumsum( p_sort ) / norm
-            #find md/rc threshold 
-            md_threshold = p_sort[ np.min( np.where( p_sort_normalized > args.p_to_learn )[0]) ] 
-            rc_threshold = md_threshold**(1./(1.-args.fraction))
-        print('\t\t-', f'Set md_threshold = {md_threshold}')
-        print('\t\t-', f'Set rc_threshold = {rc_threshold}')
+            md_threshold = p_sort[ np.min( np.where( p_sort_normalized > args.p_to_learn )[0]) ]
 
         #add the columns for rc, md threshold
         contact_matrix['md_threshold'] = np.zeros(len(p_sort))+md_threshold
-        contact_matrix['rc_threshold'] = np.zeros(len(p_sort))+rc_threshold
-        contact_matrix['limit_rc'] = 1./contact_matrix['rc_threshold']**args.fraction
+        contact_matrix['rc_threshold'] = np.zeros(len(p_sort))
+        ## TODO extend it to inter-domain cases
+        contact_matrix.loc[(contact_matrix['same_chain']==True) & (contact_matrix['intra_domain']), 'rc_threshold'] = md_threshold**(1./(1.-(args.epsilon_min/args.epsilon)))
+        contact_matrix.loc[(contact_matrix['same_chain']==True) & ~(contact_matrix['intra_domain']), 'rc_threshold'] = md_threshold**(1./(1.-(args.epsilon_min/args.inter_domain_epsilon)))
+        contact_matrix.loc[(contact_matrix['same_chain']==False), 'rc_threshold'] = md_threshold**(1./(1.-(args.epsilon_min/args.inter_epsilon)))
+        contact_matrix.loc[(contact_matrix['same_chain']==True) & (contact_matrix['intra_domain']), 'limit_rc'] = 1./contact_matrix['rc_threshold']**(args.epsilon_min/args.epsilon)
+        contact_matrix.loc[(contact_matrix['same_chain']==True) & ~(contact_matrix['intra_domain']), 'limit_rc'] = 1./contact_matrix['rc_threshold']**(args.epsilon_min/args.inter_domain_epsilon)
+        contact_matrix.loc[(contact_matrix['same_chain']==False), 'limit_rc'] = 1./contact_matrix['rc_threshold']**(args.epsilon_min/args.inter_epsilon)
 
     return contact_matrix
 
@@ -629,15 +632,17 @@ def generate_LJ(meGO_ensemble, train_dataset, check_dataset, parameters):
     # Epsilon reweight based on probability
     # Paissoni Equation 2.1
     # Attractive intramolecular
-    meGO_LJ.loc[(meGO_LJ['probability']>meGO_LJ['limit_rc']*np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==True), 'epsilon'] = -(parameters.epsilon/np.log(meGO_LJ['rc_threshold']))*(np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold'])))
+    meGO_LJ.loc[(meGO_LJ['intra_domain']==True)&(meGO_LJ['probability']>meGO_LJ['limit_rc']*np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==True), 'epsilon'] = -(parameters.epsilon/np.log(meGO_LJ['rc_threshold']))*(np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold'])))
+    meGO_LJ.loc[(meGO_LJ['intra_domain']==False)&(meGO_LJ['probability']>meGO_LJ['limit_rc']*np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==True), 'epsilon'] = -(parameters.inter_domain_epsilon/np.log(meGO_LJ['rc_threshold']))*(np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold'])))
     # Attractive intermolecular
     meGO_LJ.loc[(meGO_LJ['probability']>meGO_LJ['limit_rc']*np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==False), 'epsilon'] = -(parameters.inter_epsilon/np.log(meGO_LJ['rc_threshold']))*(np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold'])))
 
 
     # General repulsive term
     # These are with negative sign to store them as epsilon values 
     # Intramolecular
-    meGO_LJ.loc[(meGO_LJ['probability']<np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==True)&(meGO_LJ['rep']>0), 'epsilon'] = -(parameters.epsilon/np.log(meGO_LJ['rc_threshold']))*meGO_LJ['distance']**12*np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))-(meGO_LJ['rep']*(meGO_LJ['distance']/meGO_LJ['rc_distance'])**12)
+    meGO_LJ.loc[(meGO_LJ['intra_domain']==True)&(meGO_LJ['probability']<np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==True)&(meGO_LJ['rep']>0), 'epsilon'] = -(parameters.epsilon/np.log(meGO_LJ['rc_threshold']))*meGO_LJ['distance']**12*np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))-(meGO_LJ['rep']*(meGO_LJ['distance']/meGO_LJ['rc_distance'])**12)
+    meGO_LJ.loc[(meGO_LJ['intra_domain']==False)&(meGO_LJ['probability']<np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==True)&(meGO_LJ['rep']>0), 'epsilon'] = -(parameters.inter_domain_epsilon/np.log(meGO_LJ['rc_threshold']))*meGO_LJ['distance']**12*np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))-(meGO_LJ['rep']*(meGO_LJ['distance']/meGO_LJ['rc_distance'])**12)
     # Intermolecular
     meGO_LJ.loc[(meGO_LJ['probability']<np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))&(meGO_LJ['same_chain']==False)&(meGO_LJ['rep']>0), 'epsilon'] = -(parameters.inter_epsilon/np.log(meGO_LJ['rc_threshold']))*meGO_LJ['distance']**12*np.log(meGO_LJ['probability']/np.maximum(meGO_LJ['rc_probability'],meGO_LJ['rc_threshold']))-(meGO_LJ['rep']*(meGO_LJ['distance']/meGO_LJ['rc_distance'])**12)
     # mid case for Pmd>Prc but not enough to be attractive 

diff --git a/src/multiego/io.py b/src/multiego/io.py
@@ -20,7 +20,8 @@ def read_molecular_contacts(path):
 
     print('\t-', f"Reading {path}")
     contact_matrix = pd.read_csv(path, header=None, sep='\s+')
-    contact_matrix.columns = ['molecule_number_ai', 'ai', 'molecule_number_aj', 'aj', 'distance', 'probability', 'cutoff']
+    if contact_matrix.shape[1] == 7: contact_matrix.insert(7, 7, 1)
+    contact_matrix.columns = ['molecule_number_ai', 'ai', 'molecule_number_aj', 'aj', 'distance', 'probability', 'cutoff', 'intra_domain']
     contact_matrix['molecule_number_ai'] = contact_matrix['molecule_number_ai'].astype(str)
     contact_matrix['ai'] = contact_matrix['ai'].astype(str)
     contact_matrix['molecule_number_aj'] = contact_matrix['molecule_number_aj'].astype(str)
@@ -62,7 +63,7 @@ def write_nonbonded(topology_dataframe, lj_potential, parameters, output_folder)
             lj_potential.drop(columns= ['molecule_name_ai', 'molecule_name_aj'], inplace=True)
             file.write(dataframe_to_write(lj_potential))
 
-def write_model(meGO_ensemble, meGO_LJ_potential, meGO_LJ_14, parameters, output_dir):
+def write_model(meGO_ensemble, meGO_LJ_potential, meGO_LJ_14, parameters, output_dir, suffix):
     '''
     Takes care of the final print-out and the file writing of topology and ffnonbonded
 
@@ -80,6 +81,7 @@ def write_model(meGO_ensemble, meGO_LJ_potential, meGO_LJ_14, parameters, output
         Path to the output directory 
     '''
     print('- Writing Multi-eGO model')
+    output_dir = f'{output_dir}'
     write_topology(meGO_ensemble['topology_dataframe'], meGO_ensemble['molecule_type_dict'], meGO_ensemble['meGO_bonded_interactions'], meGO_LJ_14, parameters, output_dir)
     write_nonbonded(meGO_ensemble['topology_dataframe'], meGO_LJ_potential, parameters, output_dir)
 
@@ -275,7 +277,10 @@ def create_output_directories(parameters):
     '''
     if parameters.egos == 'rc':
         name = f'{parameters.system}_{parameters.egos}'
-    else: name = f'{parameters.system}_{parameters.egos}_e{parameters.epsilon}_{parameters.inter_epsilon}'
+        if parameters.out: name = f'{parameters.system}_{parameters.egos}_{parameters.out}'
+    else: 
+        name = f'{parameters.system}_{parameters.egos}_e{parameters.epsilon}_{parameters.inter_epsilon}'
+        if parameters.out: name = f'{parameters.system}_{parameters.egos}_e{parameters.epsilon}_{parameters.inter_epsilon}_{parameters.out}'
     output_folder = f'{parameters.root_dir}/outputs/{name}'
 
     if not os.path.exists(output_folder):

diff --git a/test/test_outputs/abetaref_production_e0.35_0.35/ffnonbonded.itp b/test/test_outputs/abetaref_production_e0.35_0.35/ffnonbonded.itp
@@ -3,17 +3,19 @@
 ; https://github.com/multi-ego/multi-eGO
 ; Please read and cite:
 ; Scalone, E. et al. PNAS 119, e2203181119 (2022)
-; Created on the 25-10-2023 16:33
+; Created on the 24-11-2023 10:11
 ; With the following parameters:
 ;	- system          = abetaref            
 ;	- egos            = production          
 ;	- epsilon         = 0.35                
 ;	- train_from      = native_MD           
 ;	- check_with      =                     
 ;	- p_to_learn      = 0.9995              
-;	- fraction        = 0.2                 
+;	- epsilon_min     = 0.07                
 ;	- inter_epsilon   = 0.35                
-;	- root_dir        = /home/franbt/multi-eGO
+;	- inter_domain_epsilon = 0.35                
+;	- out             =                     
+;	- root_dir        = /Users/carlo/Codes/multi-eGO
 
 [ atomtypes ]
    ; sb_type  atomic_number    mass  charge ptype           c6          c12

diff --git a/test/test_outputs/abetaref_production_e0.35_0.35/topol_GRETA.top b/test/test_outputs/abetaref_production_e0.35_0.35/topol_GRETA.top
@@ -3,17 +3,19 @@
 ; https://github.com/multi-ego/multi-eGO
 ; Please read and cite:
 ; Scalone, E. et al. PNAS 119, e2203181119 (2022)
-; Created on the 25-10-2023 16:33
+; Created on the 24-11-2023 10:11
 ; With the following parameters:
 ;	- system          = abetaref            
 ;	- egos            = production          
 ;	- epsilon         = 0.35                
 ;	- train_from      = native_MD           
 ;	- check_with      =                     
 ;	- p_to_learn      = 0.9995              
-;	- fraction        = 0.2                 
+;	- epsilon_min     = 0.07                
 ;	- inter_epsilon   = 0.35                
-;	- root_dir        = /home/franbt/multi-eGO
+;	- inter_domain_epsilon = 0.35                
+;	- out             =                     
+;	- root_dir        = /Users/carlo/Codes/multi-eGO
 
 
 ; Include forcefield parameters

diff --git a/test/test_outputs/gpref_production_e0.35_0.35/ffnonbonded.itp b/test/test_outputs/gpref_production_e0.35_0.35/ffnonbonded.itp
@@ -3,17 +3,19 @@
 ; https://github.com/multi-ego/multi-eGO
 ; Please read and cite:
 ; Scalone, E. et al. PNAS 119, e2203181119 (2022)
-; Created on the 25-10-2023 16:33
+; Created on the 24-11-2023 10:11
 ; With the following parameters:
 ;	- system          = gpref               
 ;	- egos            = production          
 ;	- epsilon         = 0.35                
 ;	- train_from      = md_ensemble         
 ;	- check_with      =                     
 ;	- p_to_learn      = 0.9995              
-;	- fraction        = 0.2                 
+;	- epsilon_min     = 0.07                
 ;	- inter_epsilon   = 0.35                
-;	- root_dir        = /home/franbt/multi-eGO
+;	- inter_domain_epsilon = 0.35                
+;	- out             =                     
+;	- root_dir        = /Users/carlo/Codes/multi-eGO
 
 [ atomtypes ]
           ; sb_type  atomic_number    mass  charge ptype           c6          c12

diff --git a/test/test_outputs/gpref_production_e0.35_0.35/topol_GRETA.top b/test/test_outputs/gpref_production_e0.35_0.35/topol_GRETA.top
@@ -3,17 +3,19 @@
 ; https://github.com/multi-ego/multi-eGO
 ; Please read and cite:
 ; Scalone, E. et al. PNAS 119, e2203181119 (2022)
-; Created on the 25-10-2023 16:33
+; Created on the 24-11-2023 10:11
 ; With the following parameters:
 ;	- system          = gpref               
 ;	- egos            = production          
 ;	- epsilon         = 0.35                
 ;	- train_from      = md_ensemble         
 ;	- check_with      =                     
 ;	- p_to_learn      = 0.9995              
-;	- fraction        = 0.2                 
+;	- epsilon_min     = 0.07                
 ;	- inter_epsilon   = 0.35                
-;	- root_dir        = /home/franbt/multi-eGO
+;	- inter_domain_epsilon = 0.35                
+;	- out             =                     
+;	- root_dir        = /Users/carlo/Codes/multi-eGO
 
 
 ; Include forcefield parameters