new sx structure parser that tolerates unexpected items #1253

Status: Draft, wants to merge 5 commits into base: main
Changes from 1 commit
188 changes: 188 additions & 0 deletions pyiron_atomistics/sphinx/parser_base.py
@@ -0,0 +1,188 @@
# coding: utf-8
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

__author__ = "Christoph Freysoldt"
__copyright__ = (
"Copyright 2023, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Christoph Freysoldt"
__email__ = "[email protected]"
__status__ = "production"
__date__ = "Dec 8, 2023"

import re
from types import GeneratorType
import numpy

class keyword_tree_parser:
    """
    A base class to parse files block by block via keyword-triggered
    parsing routines organized in a tree. Parsing routines can
    add more levels of keyword->parse function maps. The file
    is read line by line on demand while parsing, so large files will not
    exhaust memory.

    A parser routine can either return or yield (once!) to continue parsing.
    If it yields, the rest of the routine (after the yield) is executed when
    the next keyword of the current or a higher level is found.

    Every parser routine MUST remove the keyword from the lineview.

    A typical use will be

        class my_parser(keyword_tree_parser):
            def __init__(self, file):
                super().__init__({
                    "key1": self.parse_key1,
                    "key2": self.parse_key2})
                self.parse(file)

    """
    def __init__(self, keylevels=None):
        # use None as default to avoid a shared mutable default argument
        # (keylevels is mutated during parsing)
        if keylevels is None:
            keylevels = []
        if isinstance(keylevels, dict):
            keylevels = [keylevels]
        elif not isinstance(keylevels, list):
            raise TypeError("keylevels must be a dict or a list of dicts")
        self.keylevels = keylevels
Contributor:

Expecting subclasses and base classes to have different signatures for their __init__ is imo problematic because it violates the substitution principle. If subclasses should call parse in their init anyway, can the base class init not be something like

def __init__(self, file, keylevels=[]):
  if isinstance(keylevels,dict):
    keylevels = [ keylevels ]
  elif not isinstance(keylevels,list):
    raise TypeError
  self.keylevels = keylevels
  self.parse(file)

then subclasses can just do

super().__init__(file=file, keylevels=...)

Contributor Author:

I thought about your suggestion, but I am not convinced. If calling the base class initializer means doing the parsing, the base class initializer must be called at the end of the derived class's __init__. This is contrary to what most people do elsewhere.

Contributor:

Fair, but then I would also keep it out of the subclasses' __init__. An alternative would be to have the actual parsers not be subclasses of the keyword tree parser at all, but to implement a has-a relationship instead.

class StructureParser:
  def __init__(self, file):
    ... # setup
    self._parser = KeywordTreeParser(keylevels=...)
    self._parser.parse(file)
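The has-a alternative sketched above can be made concrete. This is a minimal, self-contained sketch of the composition design: `KeywordTreeParser` here is a toy stand-in (not the real pyiron class), and `StructureParser`/`parse_cell` are hypothetical names following the review comment.

```python
# Sketch of the proposed has-a (composition) design: the structure parser
# owns a keyword-tree parser instead of inheriting from it.
# KeywordTreeParser below is a toy stand-in, not the real pyiron class.

class KeywordTreeParser:
    def __init__(self, keylevels):
        self.keylevels = keylevels

    def parse(self, lines):
        # toy engine: call the handler whose keyword occurs in a line
        for line in lines:
            for key, func in self.keylevels.items():
                if key in line:
                    func(line)

class StructureParser:
    def __init__(self, lines):
        self.cells = []  # setup
        self._parser = KeywordTreeParser(keylevels={"cell": self.parse_cell})
        self._parser.parse(lines)

    def parse_cell(self, line):
        self.cells.append(line.strip())

sp = StructureParser(["cell = [1,2,3];", "unrelated line"])
print(sp.cells)  # → ['cell = [1,2,3];']
```

The point of the design is that `StructureParser` no longer has to match the engine's `__init__` signature, which sidesteps the substitution-principle concern.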


    def parse(self, filename):
        """
        Parse a file using the current keylevels

        Args:
            filename ... the filename of the file to parse
        Returns: nothing
        """
        # --- initialization
        if len(self.keylevels) == 0:
            raise KeyError("No parsing functions available in keylevels")
        filehandle = open(filename)
        self.line = iter(filehandle)
        self.lineview = ''
        self.filename = filename
        self.lineno = 0
Member:

I'm very uneasy with defining this kind of stuff outside of __init__, basically because if this class is used without calling parse, people would only see AttributeError.

Contributor:

I think the whole while could be replaced by

with open(file) as filehandle:
  for i, line in enumerate(filehandle):
    self.lineno = i
    ...

Contributor Author:

I think the whole while could be replaced by

with open(file) as filehandle:
  for i, line in enumerate(filehandle):
    self.lineno = i
    ...

No, because sometimes I need multiple lines at the same time. More importantly, the reading of additional lines can happen elsewhere than in the main parser loop.

Contributor Author:

I'm very uneasy with defining this kind of stuff outside of __init__, basically because if this class is used without calling parse, people would only see AttributeError.

Anyone who implements a parser with such an engine and never calls parse didn't get the idea of the engine. Initializing in the parse function would allow parsing multiple times with the same object (if that makes sense...).

Contributor:

Ah, I missed the read_until parts. Well, I'd want at least the with statement, because currently I don't see the file ever being closed at all.

Contributor Author:

The filehandle was implicitly closed by Python when the local variable filehandle was destroyed on leaving the routine. I made that more explicit now, also removing the parse-time-only properties.
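The closing concern can also be addressed without restructuring the main loop: keep the explicit handle but guard the whole read with try/finally. This is an illustrative sketch, not the PR's final code; `parse` and its arguments are hypothetical simplifications.

```python
import os
import tempfile

def parse(filename, consume):
    filehandle = open(filename)
    try:
        for line in filehandle:
            consume(line)
    finally:
        filehandle.close()  # closed even if a handler raises

# tiny self-check with a temporary file
with tempfile.NamedTemporaryFile("w", suffix=".sx", delete=False) as f:
    f.write("structure {}\n")
    name = f.name
collected = []
parse(name, collected.append)
os.remove(name)
print(collected)  # → ['structure {}\n']
```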

        while True:
            for keymap in self.keylevels:
                for key, func in keymap.items():
                    if key in self.lineview:
                        self._cleanup(keymap)
                        res = func()
                        if isinstance(res, GeneratorType):
                            res.send(None)
                            keymap['%finalize!'] = res
                        break
Contributor:

I like the use of generators, but I wonder if this cannot be streamlined a bit.

If I understand correctly, _cleanup is called here first because we cannot know whether the func from a previous iteration installed some new keylevels. Could we not handle that by providing a method on the base parser that temporarily modifies keylevels and also does the clean-up, as a context manager? The parser functions/generators func can then use this manager around their yield statement.

from contextlib import contextmanager
class keyword_tree_parser:
  ...
  @contextmanager
  def install_keylevel(self, new_keylevels):
    self.keylevels.append(new_keylevels)
    yield
    self.keylevels.pop(-1)

and sub classes would use this

class sub(keyword_tree_parser):
  ...
  def my_parse_method(self):
    ... # some setup
    with self.install_keylevels({'cell': self.parse_cell}):
      yield
    ... # some processing

If we also require all keylevel functions to be generators, the hot part of the loop can then just be

res = func()
res.send(None)

The back and forth between the parser generator functions and the base class is not quite clear to me, so maybe I'm missing some steps.
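The install_keylevel context manager proposed above can be tried out in isolation. This sketch follows the review comment's names but adds a try/finally so the level is popped even if parsing raises; `TreeParserSketch` is a toy stand-in, not the PR's class.

```python
# Sketch of the proposed install_keylevel context manager, with try/finally
# so the temporary level is removed even when an exception occurs.
from contextlib import contextmanager

class TreeParserSketch:
    def __init__(self):
        self.keylevels = [{"structure": None}]

    @contextmanager
    def install_keylevel(self, new_keylevels):
        self.keylevels.append(new_keylevels)
        try:
            yield
        finally:
            self.keylevels.pop()

p = TreeParserSketch()
with p.install_keylevel({"cell": None}):
    depth_inside = len(p.keylevels)
print(depth_inside, len(p.keylevels))  # → 2 1
```

The try/finally matters here: without it, a handler that raises mid-block would leave a stale keylevel installed.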

                else:
                    # no keyword in this keymap matched: try the next level
                    continue
                # a keyword matched: rescan from the top level
                break
            else:
                # no keyword anywhere in the current view: read the next line
                try:
                    self.lineview = next(self.line)
                    self.lineno += 1
                    self.line_from = self.lineno
                except StopIteration:
                    break
        self._cleanup(self.keylevels[0])
        if hasattr(self, 'finalize'):
            self.finalize()
        # close the input file
        filehandle.close()

    def location(self):
        """ Return the current parsing location (for error messages)"""
        return f"in file '{self.filename}' line" + \
               (f" {self.lineno}" if self.lineno == self.line_from else
                f"s {self.line_from}..{self.lineno}")

    def read_until(self, match):
        """
        Appends more lines from input until match is found

        Args:
            match ... (str) what to wait for
        Returns: nothing
        """
        while match not in self.lineview:
            self.lineview += next(self.line)
            self.lineno += 1

    def extract_via_regex(self, regex):
        """
        Extracts and removes some text from the current lineview

        Args:
            regex ... regular expression (string or compiled pattern)
        Returns:
            the extracted text
        """
        if isinstance(regex, str):
            regex = re.compile(regex, re.DOTALL)
        result = regex.search(self.lineview)
        if result is None:
            raise RuntimeError(f"Failed to extract '{regex.pattern}' "
                               + self.location()
                               + "\n" + self.lineview)
        self.lineview = regex.sub('', self.lineview, count=1)
        return result.group()

    def _cleanup(self, active):
        """
        (internal routine) remove levels below the current (active) level, and
        call (optional) final blocks up to the current level

        Args:
            active ... the currently active map
        Returns: nothing
        """
        def try_finalize(keymap):
            if '%finalize!' in keymap:
                try:
                    next(keymap['%finalize!'])
                except StopIteration:
                    pass
                del keymap['%finalize!']
        # roll back keylevels until active level
        while self.keylevels[-1] is not active:
            try_finalize(self.keylevels[-1])
            del self.keylevels[-1]
        # and call optional finalize of currently active level
        try_finalize(active)

    def get_vector(self, key, txt):
        """
        (auxiliary function) Get a vector from 'key = [ ... ] ;'

        Args:
            key ... the key to look for
            txt ... the text to search in
        Returns:
            one-dimensional numpy array containing the numbers
        """
        # get the relevant part between '=' and ';'
        # (re.search instead of re.sub: re.sub never returns None, so a
        #  None check on its result could not detect a failed match)
        match = re.search(key + r"\s*=\s*([^;]+);", txt, re.DOTALL)
        if match is None:
            raise RuntimeError(f"Cannot parse {key} from '{txt}' as vector "
                               + self.location())
        # remove special characters [ ] = , ; $
        vecstring = re.sub(r"[][=,;$]", ' ', match.group(1))
        return numpy.fromstring(vecstring, sep=' ')

    def extract_var(self, key, startend='=;'):
        """
        Extract a block 'key = ... ;'

        If the end pattern is not found in lineview, more lines are read.

        Args:
            key ... the keyword
            startend ... (optional) override the '=' ';' pair by two
                         different patterns
        Returns:
            the extracted block
        """
        self.read_until(startend[1])
        return self.extract_via_regex(key + r"\s*" + startend[0]
                                      + r"\s*[^" + startend[1] + ']+'
                                      + startend[1])
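The yield-once protocol the engine relies on (run a handler up to its yield, resume it when the next keyword at the same or a higher level appears) can be seen in a self-contained toy. This mimics the engine's use of `GeneratorType` and `send(None)` without file I/O or keylevel stacks; `run_handlers` and `handle_block` are illustrative names, not part of the PR.

```python
# Toy illustration of the engine's yield-once handler protocol:
# a handler runs up to its yield, and is resumed (next) when the next
# keyword appears, or at end of input.
from types import GeneratorType

def run_handlers(lines, keymap):
    pending = None
    for line in lines:
        for key, func in keymap.items():
            if key in line:
                if pending is not None:
                    # finish the previous handler's post-yield part
                    try:
                        next(pending)
                    except StopIteration:
                        pass
                    pending = None
                res = func(line)
                if isinstance(res, GeneratorType):
                    res.send(None)  # run the handler up to its yield
                    pending = res
    if pending is not None:  # finalize at end of input
        try:
            next(pending)
        except StopIteration:
            pass

events = []
def handle_block(line):
    events.append("start " + line.strip())
    yield  # resume when the next keyword (or end of input) is reached
    events.append("end " + line.strip())

run_handlers(["block A", "block B"], {"block": handle_block})
print(events)  # → ['start block A', 'end block A', 'start block B', 'end block B']
```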


147 changes: 73 additions & 74 deletions pyiron_atomistics/sphinx/structure.py
@@ -2,100 +2,99 @@
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

from collections import OrderedDict
import re
import numpy as np
import scipy.constants
from pyiron_atomistics.atomistics.structure.parser_base import keyword_tree_parser
from pyiron_atomistics.atomistics.structure.atoms import Atoms
from pyiron_atomistics.atomistics.structure.periodic_table import PeriodicTable

__author__ = "Sudarsan Surendralal, Jan Janssen"
__author__ = "Christoph Freysoldt"
__copyright__ = (
"Copyright 2021, Max-Planck-Institut für Eisenforschung GmbH - "
"Copyright 2023, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Sudarsan Surendralal"
__email__ = "surendralal@mpie.de"
__version__ = "2.0"
__maintainer__ = "Christoph Freysoldt"
__email__ = "freysoldt@mpie.de"
__status__ = "production"
__date__ = "Feb 4, 2018"
__date__ = "Dec 8, 2023"

BOHR_TO_ANGSTROM = (
scipy.constants.physical_constants["Bohr radius"][0] / scipy.constants.angstrom
)

class struct_parser(keyword_tree_parser):
    """
    This class reads one or more structures in sx format.
    """
    def __init__(self, file):
        super().__init__({'structure': self.parse_structure})
        self.configs = []
        self.parse(file)

    def parse_structure(self):
        """ Parses structure{} blocks"""
        self.keylevels.append({
            'cell': self.parse_cell,
            'species': self.parse_species})
        self.extract_via_regex('structure')
        # --- initialize for next structure
        self.cell = None
        self.positions = []
        self.species = []
        self.indices = []
        self.ispecies = -1
Contributor:

Do these need to be attributes or can they be just local variables? They are only used to instantiate Atoms on L52 and below, or not?

        # continue parsing
        yield
        # create Atoms object and append it to configs
        pse = PeriodicTable()
        atoms = Atoms(
            species=[pse.element(s) for s in self.species],
            indices=self.indices,
            cell=self.cell * BOHR_TO_ANGSTROM,
            positions=np.array(self.positions) * BOHR_TO_ANGSTROM,
            pbc=True,
        )
        self.configs.append(atoms)

    def parse_cell(self):
        """ Read the cell"""
        txt = self.extract_var('cell')
        self.cell = self.get_vector('cell', txt).reshape(3, 3)

    def parse_species(self):
        """ Parses species{} blocks"""
        self.extract_via_regex('species')
        self.keylevels.append({
            'element': self.get_element,
            'atom': self.read_atom})
        self.ispecies += 1

    def get_element(self):
        """Read element"""
        txt = self.extract_var('element')
        self.species.append(re.sub('.*"([^"]*)".*', r"\1", txt))

    def read_atom(self):
        """Read atomic coordinates from an atom block"""
        txt = self.extract_var('atom', '{}')
        self.positions.append(self.get_vector('coords', txt))
        self.indices.append(self.ispecies)
        if 'label' in txt:
            label = re.sub(r'.*label\s*=\s*"([^"]+)"\s*;.*', r"\1", txt)
            print(f"atom {len(self.positions)} label={label}")


def read_atoms(filename="structure.sx"):
    """
    Args:
        filename (str): Filename of the sphinx structure file

    Returns:
        pyiron_atomistics.objects.structure.atoms.Atoms instance
        pyiron_atomistics.objects.structure.atoms.Atoms instance (or a list of them)

    """
    file_string = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            file_string.append(line)
    cell_trigger = "cell"
    cell_string = list()
    species_list = list()
    species_trigger = "element"
    positions_dict = OrderedDict()
    positions = list()
    pse = PeriodicTable()
    for i, line in enumerate(file_string):
        if cell_trigger in line:
            for j in range(len(file_string)):
                line_str = file_string[i + j]
                cell_string.append(line_str)
                if ";" in line_str:
                    break
        if species_trigger in line:
            species = (
                line.strip().split("=")[-1].replace(";", "").replace('"', "").strip()
            )
            species_list.append(pse.element(species))
            positions_dict[species] = 0
            for j in range(len(file_string) - i):
                line_str = file_string[i + j]
                k = 0
                if "atom" in line_str:
                    break_loop = False
                    while not break_loop:
                        position_string = " ".join(
                            file_string[i + j + k].split("=")[-1]
                        )
                        replace_list = ["[", "]", ";", "}", "movable", "X", "Y", "Z"]
                        for rep in replace_list:
                            position_string = (
                                "".join(position_string).replace(rep, " ").split()
                            )
                        positions.append(
                            np.array(position_string[0].split(","), dtype=float)
                        )
                        positions_dict[species] += 1
                        k += 1
                        if (i + j + k) <= len(file_string) - 1:
                            if (
                                "element" in file_string[i + j + k]
                                or "atom" not in file_string[i + j + k]
                            ):
                                break_loop = True
                                break
    indices = list()
    for i, val in enumerate(positions_dict.values()):
        indices.append(np.ones(val, dtype=int) * i)
    indices = np.hstack(indices)
    replace_list = ["cell", "=", "[", "]", ",", ";"]
    for rep in replace_list:
        cell_string = " ".join(cell_string).replace(rep, " ").split()
    cell = np.array(cell_string, dtype=float).reshape((3, 3)) * BOHR_TO_ANGSTROM
    atoms = Atoms(
        species=species_list,
        indices=indices,
        cell=cell,
        positions=np.array(positions) * BOHR_TO_ANGSTROM,
    )
    return atoms
    configs = struct_parser(filename).configs
    return configs[0] if len(configs) == 1 else configs
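The vector extraction the new parser performs on sx-style input can be checked in isolation: pull the numbers out of a `cell = [[...],[...],[...]];` block the same way get_vector does (regex capture between `=` and `;`, strip the bracket characters, `numpy.fromstring`). A standalone sketch of that step, not the PR's code:

```python
# Round-trip check of sx-style vector extraction as done by get_vector:
# capture the text between '=' and ';', strip brackets/commas, parse floats.
import re

import numpy

def get_vector(key, txt):
    # keep only the part between 'key =' and ';'
    vecstring = re.sub('.*' + key + r"\s*=\s*([^;]+);.*", r"\1", txt,
                       flags=re.DOTALL)
    # remove special characters [ ] = , ; $
    vecstring = re.sub(r"[][=,;$]", ' ', vecstring)
    return numpy.fromstring(vecstring, sep=' ')

txt = "cell = [[10,0,0],[0,10,0],[0,0,10]];"
cell = get_vector("cell", txt).reshape(3, 3)
print(cell[0, 0], cell.shape)  # → 10.0 (3, 3)
```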