Update kinetics comment parser to handle autogenerated trees

sevyharris · sevyharris · commit ae61efd3dfc6 · 2025-04-24T17:21:35.000-04:00
diff --git a/rmgpy/data/kinetics/database.py b/rmgpy/data/kinetics/database.py
@@ -752,42 +752,47 @@ def reconstruct_kinetics_from_source(self, reaction, source, fix_barrier_height=
                 else:
                     kinetics = training_entry.data
             elif 'Rate Rules' in source:
-
                 source_dict = source['Rate Rules'][1]
                 rules = source_dict['rules']
                 training = source_dict['training']
                 degeneracy = source_dict['degeneracy']
 
-                log_a = 0
-                n = 0
-                alpha = 0
-                E0 = 0
-                for rule_entry, weight in rules:
-                    log_a += np.log10(rule_entry.data.A.value_si) * weight
-                    n += rule_entry.data.n.value_si * weight
-                    alpha += rule_entry.data.alpha.value_si * weight
-                    E0 += rule_entry.data.E0.value_si * weight
-                for rule_entry, training_entry, weight in training:
-                    log_a += np.log10(rule_entry.data.A.value_si) * weight
-                    n += rule_entry.data.n.value_si * weight
-                    alpha += rule_entry.data.alpha.value_si * weight
-                    E0 += rule_entry.data.E0.value_si * weight
-
-                a_units = rule_entry.data.A.units
-                if a_units == 'cm^3/(mol*s)' or a_units == 'cm^3/(molecule*s)' or a_units == 'm^3/(molecule*s)':
-                    a_units = 'm^3/(mol*s)'
-                elif a_units == 'cm^6/(mol^2*s)' or a_units == 'cm^6/(molecule^2*s)' or a_units == 'm^6/(molecule^2*s)':
-                    a_units = 'm^6/(mol^2*s)'
-                elif a_units == 's^-1' or a_units == 'm^3/(mol*s)' or a_units == 'm^6/(mol^2*s)':
-                    pass
-                else:
-                    raise ValueError('Invalid units {0} for averaging kinetics.'.format(a_units))
-                kinetics = ArrheniusEP(
-                    A=(degeneracy * 10 ** log_a, a_units),
-                    n=n,
-                    alpha=alpha,
-                    E0=(E0 * 0.001, "kJ/mol"),
-                )
+                if rules and isinstance(rules[0][0].data, ArrheniusBM):
+                    # This is a rate rule with ArrheniusBM kinetics
+                    assert len(rules) == 1, "There should only be one rate rule for ArrheniusBM kinetics in the autogenerated trees"
+                    kinetics = rules[0][0].data
+                    kinetics.A.value_si *= degeneracy
+                else:  # ArrheniusEP kinetics
+                    log_a = 0
+                    n = 0
+                    alpha = 0
+                    E0 = 0
+                    for rule_entry, weight in rules:
+                        log_a += np.log10(rule_entry.data.A.value_si) * weight
+                        n += rule_entry.data.n.value_si * weight
+                        alpha += rule_entry.data.alpha.value_si * weight
+                        E0 += rule_entry.data.E0.value_si * weight
+                    for rule_entry, training_entry, weight in training:
+                        log_a += np.log10(rule_entry.data.A.value_si) * weight
+                        n += rule_entry.data.n.value_si * weight
+                        alpha += rule_entry.data.alpha.value_si * weight
+                        E0 += rule_entry.data.E0.value_si * weight
+                    a_units = rule_entry.data.A.units
+                    if a_units == 'cm^3/(mol*s)' or a_units == 'cm^3/(molecule*s)' or a_units == 'm^3/(molecule*s)':
+                        a_units = 'm^3/(mol*s)'
+                    elif a_units == 'cm^6/(mol^2*s)' or a_units == 'cm^6/(molecule^2*s)' or a_units == 'm^6/(molecule^2*s)':
+                        a_units = 'm^6/(mol^2*s)'
+                    elif a_units == 's^-1' or a_units == 'm^3/(mol*s)' or a_units == 'm^6/(mol^2*s)':
+                        pass
+                    else:
+                        raise ValueError('Invalid units {0} for averaging kinetics.'.format(a_units))
+                    
+                    kinetics = ArrheniusEP(
+                        A=(degeneracy * 10 ** log_a, a_units),
+                        n=n,
+                        alpha=alpha,
+                        E0=(E0 * 0.001, "kJ/mol"),
+                    )
             else:
                 raise ValueError("Source data must be either 'Library', 'PDep','Training', or 'Rate Rules'.")
 
diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py
@@ -4442,24 +4442,29 @@ def extract_source_from_comments(self, reaction):
         """
         lines = reaction.kinetics.comment.split('\n')
 
-        exact = False
+        exact_rule = False
         template = None
         rules = None
         training_entries = None
         degeneracy = 1
 
-        regex = r"\[(.*)\]"  # only hit outermost brackets
+        training_reaction_pattern = r'Matched reaction\s*(\d+).*in.*training'
+        degeneracy_pattern = r'Multiplied by reaction path degeneracy\s*(\d+)'
+
         for line in lines:
-            if line.startswith('Matched'):
+            training_matches = re.search(training_reaction_pattern, line)
+            degeneracy_matches = re.search(degeneracy_pattern, line)
+
+            if training_matches is not None:
                 # Source of the kinetics is from training reaction
-                training_reaction_index = int(line.split()[2])
+                training_reaction_index = int(training_matches.group(1))
                 depository = self.get_training_depository()
                 training_entry = depository.entries[training_reaction_index]
                 # Perform sanity check that the training reaction's label matches that of the comments
                 if training_entry.label not in line:
-                    raise AssertionError('Reaction {0} uses kinetics from training reaction {1} '
-                                         'but does not match the training reaction {1} from the '
-                                         '{2} family.'.format(reaction, training_reaction_index, self.label))
+                    raise AssertionError(f'Reaction {reaction} uses kinetics from training reaction {training_reaction_index} '
+                                         f'but does not match the training reaction {training_reaction_index} from the '
+                                         f'{self.label} family.')
 
                 # Sometimes the matched kinetics could be in the reverse direction.....
                 if reaction.is_isomorphic(training_entry.item, either_direction=False, save_order=self.save_order):
@@ -4468,34 +4473,34 @@ def extract_source_from_comments(self, reaction):
                     reverse = True
                 return True, [self.label, training_entry, reverse]
 
-            elif line.startswith('Exact match'):
-                exact = True
-            elif line.startswith('Estimated'):
-                pass
-            elif line.startswith('Multiplied by'):
-                degeneracy = float(line.split()[-1])
+            if 'Exact match found for rate rule' in line:
+                exact_rule = True
+            if degeneracy_matches is not None:
+                degeneracy = float(degeneracy_matches.group(1))
 
         # Extract the rate rule information
         full_comment_string = reaction.kinetics.comment.replace('\n', ' ')
-
+        autogen_node_search_pattern = r'Estimated from node (.*)'
         # The rate rule string is right after the phrase 'for rate rule'
-        rate_rule_string = full_comment_string.split("for rate rule", 1)[1].strip()
-
-        if rate_rule_string[0] == '[':
-            # Get the contents of the capture group in the regex
-            # Remove any spaces which may be left over as a result of a line break
-            template_label = re.split(regex, rate_rule_string)[1].replace(' ', '')
+        template_pattern = r"for rate rule \[(.*)\]"  # only hit outermost brackets
+        autogen_node_matches = re.search(autogen_node_search_pattern, full_comment_string)
+        template_matches = re.search(template_pattern, full_comment_string)
+        if autogen_node_matches is not None:  # autogenerated trees
+            template_str = autogen_node_matches.group(1).split('Multiplied by reaction path degeneracy')[0].strip()
+            tokens = template_str.split()
+            if len(tokens) == 2:  # The node was probably split because wordwrap was turned off
+                assert len(template_str) > 115, 'The node name is too short to have been broken up by the chemkin writer'
+                template_str = ''.join(tokens)
+            elif len(tokens) > 2:  # warn the user the node is probably wrong
+                raise ValueError(f'The node name {template_str} has multiple spaces and cannot be parsed for reaction {reaction}.')
+            template = self.retrieve_template([template_str])
+        elif template_matches is not None:  # hand-built trees
+            template_label = template_matches.group(1)
+            template = self.retrieve_template(template_label.split(';'))
         else:
-            # If this has the line 'From training reaction # for rate rule node1;node2'
-            template_label = rate_rule_string.split()[0]
-
-        template = self.retrieve_template(template_label.split(';'))
+            raise ValueError(f'Could not find rate rule in comments for reaction {reaction}.')
         rules, training_entries = self.get_sources_for_template(template)
-
-        if not template:
-            raise ValueError('Could not extract kinetics source from comments for reaction {}.'.format(reaction))
-
-        source_dict = {'template': template, 'degeneracy': degeneracy, 'exact': exact,
+        source_dict = {'template': template, 'degeneracy': degeneracy, 'exact': exact_rule,
                        'rules': rules, 'training': training_entries}
 
         # Source of the kinetics is from rate rules