diff --git a/CHANGELOG.md b/CHANGELOG.md index 387dd167..ce041997 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased +## [1.0.3] - 2024-07-20 + +### Fixed + +- `standardize_formula`: Fix incorrect display of additional formulas, including methane which + was shown as "H4C(aq)", other tri-anions (N3-, P3-), and a variety of haloacetic acids. For + example, trichloroacetic acid was previously shown as `'C2Cl3O2[-1]'` but will now display + as `'CCl3COO[-1]'`. + +### Added + +- `standardize_formula`: `pyEQL` can now parse ion formulas that contain unicode superscript + or subscript characters, which makes input even more flexible. For example, `"PO₄³⁻"` and `"Co²⁺"` + will now standardize correctly to `"PO4[-3]"` and `"Co[+2]"`, respectively. ### Changed diff --git a/docs/chemistry.md b/docs/chemistry.md index f51d9146..c209227d 100644 --- a/docs/chemistry.md +++ b/docs/chemistry.md @@ -23,7 +23,8 @@ Here are some examples: | Sodium Sulfate | "Na2(SO4)" or "Na2SO4" | "Na(SO4)(aq)" | | Sodium Ion | "Na+", "Na+1", "Na1+", or "Na[+]" | "Na[+1]" | | Magnesium Ion | "Mg+2", "Mg++", or "Mg[++]" | "Mg[+2]" | -| Methanol | "CH3OH", "CH4O" | "'CH3OH(aq)'" | +| Methanol | "CH3OH", "CH4O" | "CH3OH(aq)" | +| Phosphate Ion | "PO4-3", "PO₄³⁻" | "PO4[-3]" | Specifically, `standardize_formula` uses `Ion.from_formula().reduced_formla` (shown in the right hand column of the table) to identify solutes. Notice that for charged species, the charges are always placed inside square brackets @@ -33,6 +34,11 @@ by `(aq)` to disambiguate them from solids. ```{important} **When writing multivalent ion formulas, it is strongly recommended that you put the charge number AFTER the + or - sign** (e.g., type "Mg+2" NOT "Mg2+"). 
The latter formula is ambiguous - it could mean $Mg_2^+$ or $Mg^{+2}$ and it will be processed incorrectly into `Mg[+0.5]` + +There is **one exception** to the rule above. If you really want to list the charge number +first, you can use unicode superscript characters (e.g., "Co²⁺"), and `pyEQL` will understand +these regardless of the order of the `+` and the `2`. So you can write "Co²⁺" and it will be +correctly standardized to `Co[+2]`. ``` (manual-testing)= diff --git a/src/pyEQL/utils.py b/src/pyEQL/utils.py index 69b5f555..7b1b9737 100644 --- a/src/pyEQL/utils.py +++ b/src/pyEQL/utils.py @@ -60,6 +60,18 @@ def standardize_formula(formula: str): be enclosed in square brackets to remove any ambiguity in the meaning of the formula. For example, 'Na+', 'Na+1', and 'Na[+]' will all standardize to "Na[+1]" """ + # fix permuted sign and charge number (e.g. Co2+) + for str, rep in zip(["²⁺", "³⁺", "⁴⁺", "²⁻", "³⁻", "⁴⁻"], ["+2", "+3", "+4", "-2", "-3", "-4"]): + formula = formula.replace(str, rep) + + # replace superscripts with non superscripts + for char, rep in zip("⁻⁺⁰¹²³⁴⁵⁶⁷⁸⁹", "-+0123456789"): + formula = formula.replace(char, rep) + + # replace subscripts with non subscripts + for char, rep in zip("₀₁₂₃₄₅₆₇₈₉", "0123456789"): + formula = formula.replace(char, rep) + sform = Ion.from_formula(formula).reduced_formula # TODO - manual formula adjustments. 
May be implemented upstream in pymatgen in the future @@ -81,15 +93,49 @@ def standardize_formula(formula: str): # thiocyanate elif sform == "CSN[-1]": sform = "SCN[-1]" - # triiodide + # triiodide, nitride, and phosphide elif sform == "I[-0.33333333]": sform = "I3[-1]" + elif sform == "N[-0.33333333]": + sform = "N3[-1]" + elif sform == "P[-0.33333333]": + sform = "P3[-1]" # formate elif sform == "HCOO[-1]": sform = "HCO2[-1]" # oxalate elif sform == "CO2[-1]": sform = "C2O4[-2]" + # triflate + elif sform == "CS(OF)3[-1]": + sform = "CF3SO3[-1]" + # haloacetic acids of F, Cl, Br, I + elif sform == "C2Cl3O2[-1]": + sform = "CCl3COO[-1]" + elif sform == "C2O2F3[-1]": + sform = "CF3COO[-1]" + elif sform == "C2I3O2[-1]": + sform = "CI3COO[-1]" + elif sform == "C2Br3O2[-1]": + sform = "CBr3COO[-1]" + + # Cl+F + elif sform == "C2Cl2O2F[-1]": + sform = "CFCl2COO[-1]" + elif sform == "C2Cl(OF)2[-1]": + sform = "CF2ClCOO[-1]" + + # Cl+Br + elif sform == "C2Br(ClO)2[-1]": + sform = "CBrCl2COO[-1]" + elif sform == "C2Br2ClO2[-1]": + sform = "CBr2ClCOO[-1]" + + # Cl+I + elif sform == "C2I(ClO)2[-1]": + sform = "CICl2COO[-1]" + elif sform == "C2I2ClO2[-1]": + sform = "CI2ClCOO[-1]" # TODO - consider adding recognition of special formulas like MeOH for methanol or Cit for citrate return sform diff --git a/tests/test_utils.py b/tests/test_utils.py index 58c3295b..2226923b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -26,12 +26,32 @@ def test_standardize_formula(): assert standardize_formula("H2PO4-") == "H2PO4[-1]" assert standardize_formula("SCN-") == "SCN[-1]" assert standardize_formula("I3-") == "I3[-1]" + assert standardize_formula("N3-") == "N3[-1]" + assert standardize_formula("P3-") == "P3[-1]" assert standardize_formula("HCOO-") == "HCO2[-1]" assert standardize_formula("CO2-1") == "C2O4[-2]" assert standardize_formula("C2O4--") == "C2O4[-2]" assert standardize_formula("H3PO4") == "H3PO4(aq)" assert standardize_formula("H2SO4") == "H2SO4(aq)" assert 
standardize_formula("HClO4") == "HClO4(aq)" + assert standardize_formula("CF3SO3-") == "CF3SO3[-1]" + # superscripts, subscripts, and permuted sign/charge number + assert standardize_formula("PO₄³⁻") == "PO4[-3]" + assert standardize_formula("Co²⁺") == "Co[+2]" + # haloacetic acids + assert standardize_formula("CCl3COO-") == "CCl3COO[-1]" + assert standardize_formula("CF3COO-") == "CF3COO[-1]" + assert standardize_formula("CI3COO-") == "CI3COO[-1]" + assert standardize_formula("CBr3COO-") == "CBr3COO[-1]" + # Cl+F + assert standardize_formula("CCl2FCOO-") == "CFCl2COO[-1]" + assert standardize_formula("CClF2COO-") == "CF2ClCOO[-1]" + # Cl+I + assert standardize_formula("CCl2ICOO-") == "CICl2COO[-1]" + assert standardize_formula("CClI2COO-") == "CI2ClCOO[-1]" + # Cl+Br + assert standardize_formula("CBrCl2COO-") == "CBrCl2COO[-1]" + assert standardize_formula("CBr2ClCOO-") == "CBr2ClCOO[-1]" def test_formula_dict():