Added example about how to write back the parsed tree, as requested at issue #3
jmfernandez committed Nov 21, 2024
1 parent 19cad56 commit 2a9d0b4
Showing 6 changed files with 209 additions and 27 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/pre-commit.yml
@@ -10,11 +10,11 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
name: Pre-commit python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
id: cachepy
with:
python-version: ${{ matrix.python-version }}
@@ -34,17 +34,17 @@ jobs:
- name: 'Install dev requirements'
run: pip install -r dev-requirements.txt -r mypy-requirements.txt
- name: MyPy cache
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: .mypy_cache/${{ matrix.python-version }}
key: mypy-${{ matrix.python-version }}
- name: 'pre-commit'
uses: pre-commit/action@v3.0.0
uses: pre-commit/action@v3.0.1
# if: ${{ matrix.python-version != '3.6' }}
with:
extra_args: --all -c .pre-commit-config.yaml
# - name: 'pre-commit (custom Python ${{ matrix.python-version }})'
# uses: pre-commit/action@v3.0.0
# uses: pre-commit/action@v3.0.1
# if: ${{ matrix.python-version == '3.6' }}
# with:
# extra_args: --all -c .pre-commit-config-gh-${{ matrix.python-version }}.yaml
@@ -60,7 +60,7 @@ jobs:
- name: Print licences report
if: ${{ always() }}
run: echo "${{ steps.license_check_report.outputs.report }}"
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
retention-days: 2
path: constraints-${{ matrix.python-version }}.txt
@@ -71,8 +71,8 @@ jobs:
needs:
- pre-commit
steps:
- uses: actions/checkout@v3
- uses: actions/download-artifact@v3
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
with:
path: changes-dir
- name: Move artifacts to their right place
@@ -81,7 +81,7 @@ jobs:
rm -r changes-dir/artifact
- name: Create Pull Request
id: cpr
uses: peter-evans/create-pull-request@v4
uses: peter-evans/create-pull-request@v7
with:
title: Updated constraints (triggered by ${{ github.sha }})
delete-branch: true
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -31,7 +31,7 @@ repos:
# args: [--strict, --show-error-codes, --no-warn-unused-ignores, --python-executable, .pyWEenv/bin/python]
# - repo: meta
- repo: https://github.com/jmfernandez/pre-commit_mirrors-actionlint.git
rev: v1.6.24
rev: v1.7.1
hooks:
- id: actionlint

19 changes: 13 additions & 6 deletions README.md
@@ -22,12 +22,12 @@ pip install git+https://github.com/inab/python-groovy-parser.git

## Test programs

This repo contains a couple of test programs called
[translated-groovy3-parser.py](translated-groovy3-parser.py) and
[cached-translated-groovy3-parser.py](cached-translated-groovy3-parser.py),
This repo contains three test programs called
[translated-groovy3-parser.py](translated-groovy3-parser.py),
[cached-translated-groovy3-parser.py](cached-translated-groovy3-parser.py) and [parser-groovy-writer.py](parser-groovy-writer.py),
which demonstrate how to use the parser and digest it a bit.

The programs take one or more files as input.
All the programs take one or more files as input.

```bash
git pull https://github.com/nf-core/rnaseq.git
@@ -42,8 +42,8 @@ Also, when the parsing task worked properly, it condenses and serializes
the parse tree into a file with extension `.lark.json` (for instance,
`rnaseq/modules/local/bedtools_genomecov.nf.lark.json`).

And as a proof of concept, it tries to identify features from Nextflow files,
like the declared processes, includes and workflows, and they are roughly printed
The first two programs try, as a proof of concept, to identify features from Nextflow files,
like the declared `process`, `include` and `workflow`, and they are roughly printed
to a file with the extension `.lark.result` (for instance `rnaseq/modules/local/bedtools_genomecov.nf.lark.result`).

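For illustration only (this sketch is not code from the repository), the digested tree is a nest of dictionaries where rule nodes carry `rule`/`children` keys and token nodes carry `leaf`/`value` keys, so a rough keyword scan in the spirit of those two programs could look like the following; the helper names and the counting approach are made up:

```python
# Rough sketch, not code from this repository: walk a digested tree made of
# dicts with "rule"/"children" (rule nodes) or "leaf"/"value" (token nodes)
# and count a few Nextflow-ish keywords among the leaf values.
from typing import Any, Dict, Iterator


def iter_leaves(node: Any) -> Iterator[Dict[str, Any]]:
    """Yield every leaf-like dict found under the given node."""
    if not isinstance(node, dict):
        return
    children = node.get("children")
    if children is not None:
        for child in children:
            yield from iter_leaves(child)
    elif node.get("leaf") is not None:
        yield node


def count_keywords(t_tree: Dict[str, Any]) -> Dict[str, int]:
    """Count how often 'process', 'include' and 'workflow' appear as leaves."""
    counts = {"process": 0, "include": 0, "workflow": 0}
    for leaf in iter_leaves(t_tree):
        value = leaf.get("value")
        if value in counts:
            counts[value] += 1
    return counts
```
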
As parsing task is heavy, the parsing module also contains a method to
@@ -59,6 +59,13 @@ The caching directory contents depend on the grammar and the implementations, as
So, if this software is updated (because the grammar was updated or a bug was fixed),
cached contents from previous versions are not reused.

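For context, a minimal sketch (not part of this commit) of how the cached entry point `parse_and_digest_groovy_content` from `groovy_parser/parser.py` might be used; the cache directory name is an arbitrary example:

```python
# Minimal sketch, assuming groovy_parser is installed and the rnaseq checkout
# from the snippet above is available; the cache directory name is made up.
from groovy_parser.parser import parse_and_digest_groovy_content

with open("rnaseq/modules/local/bedtools_genomecov.nf", mode="r", encoding="utf-8") as gH:
    content = gH.read()

# Repeated runs reuse the digested tree serialized under cache_directory;
# entries depend on the grammar and implementation files, so a package
# upgrade leaves stale entries unused.
t_tree = parse_and_digest_groovy_content(
    content,
    cache_directory="groovy-parser-cache",
)
```
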
The third program, `parser-groovy-writer.py`, was written in response to a request from an
issue whose author wanted to write the parsed tree back after some processing.
So, this program writes whatever survived the parsing to a new file with the extension `.mirrored`.
In the current implementation some elements, like comments and certain combinations of
whitespace, are not propagated from the tokenizer to the lexer and parser,
so they are not reintegrated.

# Acknowledgements

The tokenizer is an evolution from Pygments Groovy lexer https://github.com/pygments/pygments/blob/b7c8f35440f591c6687cb912aa223f5cf37b6704/pygments/lexers/jvm.py#L543-L618
6 changes: 3 additions & 3 deletions groovy_parser/__init__.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

# SPDX-License-Identifier: Apache-2.0
# Copyright (C) 2023 Barcelona Supercomputing Center, José M. Fernández
# Copyright (C) 2024 Barcelona Supercomputing Center, José M. Fernández
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
# limitations under the License.

__author__ = "José M. Fernández <https://orcid.org/0000-0002-4806-5140>"
__copyright__ = "2023 Barcelona Supercomputing Center (BSC), ES"
__copyright__ = "2024 Barcelona Supercomputing Center (BSC), ES"
__license__ = "Apache-2.0"

# https://www.python.org/dev/peps/pep-0396/
__version__ = "0.1.1"
__version__ = "0.1.2"
41 changes: 34 additions & 7 deletions groovy_parser/parser.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

# SPDX-License-Identifier: Apache-2.0
# Copyright (C) 2023 Barcelona Supercomputing Center, José M. Fernández
# Copyright (C) 2024 Barcelona Supercomputing Center, José M. Fernández
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -118,11 +118,23 @@ def default( # type: ignore[override]
and isinstance(children[0], LarkTree)
and children[0].data not in noflat
):
return self.default(children[0], rule=new_rule)
return self.default(
children[0],
rule=new_rule,
prune=prune,
noflat=noflat,
)
else:
return {
"rule": new_rule,
"children": [self.default(child) for child in children],
"children": [
self.default(
child,
prune=prune,
noflat=noflat,
)
for child in children
],
}
else:
# No children!!!!!!!
@@ -176,8 +188,16 @@ def parse_groovy_content(content: "str") -> "ParseTree":
return tree


def digest_lark_tree(tree: "ParseTree") -> "Union[RuleNode, LeafNode, EmptyNode]":
return LarkFilteringTreeEncoder().default(tree)
def digest_lark_tree(
tree: "ParseTree",
prune: "Sequence[str]" = ["sep", "nls"],
noflat: "Sequence[str]" = ["script_statement"],
) -> "Union[RuleNode, LeafNode, EmptyNode]":
return LarkFilteringTreeEncoder().default(
tree,
prune=prune,
noflat=noflat,
)


SIGNATURE_FILES = [
Expand All @@ -196,7 +216,10 @@ def digest_lark_tree(tree: "ParseTree") -> "Union[RuleNode, LeafNode, EmptyNode]


def parse_and_digest_groovy_content(
content: "str", cache_directory: "Optional[str]" = None
content: "str",
cache_directory: "Optional[str]" = None,
prune: "Sequence[str]" = ["sep", "nls"],
noflat: "Sequence[str]" = ["script_statement"],
) -> "Union[RuleNode, LeafNode, EmptyNode]":
t_tree: "Optional[Union[RuleNode, LeafNode, EmptyNode]]" = None
hashfile: "Optional[str]" = None
@@ -240,7 +263,11 @@ def parse_and_digest_groovy_content(

if t_tree is None:
tree = parse_groovy_content(content)
t_tree = LarkFilteringTreeEncoder().default(tree)
t_tree = LarkFilteringTreeEncoder().default(
tree,
prune=prune,
noflat=noflat,
)

if hashfile is not None:
with gzip.open(hashfile, mode="wt", encoding="utf-8") as jH:
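As a usage note, the refactored `digest_lark_tree` now exposes the `prune` and `noflat` keyword arguments shown above. A minimal sketch of how a caller might pass them, with an illustrative one-line Groovy snippet that is not from the repository:

```python
# Minimal sketch, assuming groovy_parser is installed.
from groovy_parser.parser import parse_groovy_content, digest_lark_tree

content = "println 'hello'\n"  # illustrative snippet, not from the repo
tree = parse_groovy_content(content)

# The defaults prune "sep"/"nls" rule nodes and keep "script_statement"
# unflattened; parser-groovy-writer.py below passes prune=[] so the
# separators survive and can be written back out.
t_tree = digest_lark_tree(tree, prune=[], noflat=["script_statement"])
```
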
148 changes: 148 additions & 0 deletions parser-groovy-writer.py
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# SPDX-License-Identifier: Apache-2.0
# groovy-parser, a proof of concept Groovy parser based on Pygments and Lark
# Copyright (C) 2024 Barcelona Supercomputing Center, José M. Fernández
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
import re
import sys

from typing import (
cast,
NamedTuple,
TYPE_CHECKING,
)

if TYPE_CHECKING:
from typing import (
IO,
Iterator,
MutableSequence,
Optional,
Sequence,
Tuple,
Union,
)

from groovy_parser.parser import (
EmptyNode,
LeafNode,
RuleNode,
)

from pygments.token import Token

from groovy_parser.parser import (
parse_groovy_content,
digest_lark_tree,
)

from lark import (
Lark,
Transformer,
v_args,
)
from lark.visitors import Discard

prev_wants_space = False


def write_groovy(
t_tree: "Union[RuleNode, LeafNode, EmptyNode]",
mH: "IO[str]",
reset_prev_wants_space: "bool" = False,
) -> None:
global prev_wants_space
if reset_prev_wants_space:
prev_wants_space = False
children = cast("RuleNode", t_tree).get("children")
if children is not None:
for child in children:
write_groovy(child, mH=mH)
else:
leaf = cast("LeafNode", t_tree).get("leaf")
value = cast("LeafNode", t_tree).get("value")
if value is not None and leaf is not None:
wants_space = False
print(f"Leaf {leaf} value {value}")
if prev_wants_space and leaf in (
"STRING_LITERAL",
"IDENTIFIER",
"CAPITALIZED_IDENTIFIER",
"LBRACE",
"GSTRING_BEGIN",
):
# These whitespace separators were silenced
mH.write(" ")

if leaf == "STRING_LITERAL":
mH.write("'")

mH.write(value)
if leaf in ("IDENTIFIER", "CAPITALIZED_IDENTIFIER", "RBRACE", "COMMA"):
wants_space = True
elif leaf == "STRING_LITERAL":
mH.write("'")

prev_wants_space = wants_space


def mirror_groovy_source(
filename: "str", jsonfile: "str", mirror_filename: "str"
) -> "Union[RuleNode, LeafNode, EmptyNode]":
with open(filename, mode="r", encoding="utf-8") as wfH:
content = wfH.read()

tree = parse_groovy_content(content)

t_tree = digest_lark_tree(tree, prune=[])

# These are for debugging purposes
# logging.debug(tree.pretty())
# with open(jsonfile, mode="w", encoding="utf-8") as jH:
# json.dump(tree, jH, indent=4, cls=LarkFilteringTreeEncoder)
with open(jsonfile, mode="w", encoding="utf-8") as jH:
json.dump(t_tree, jH, indent=4)

with open(mirror_filename, mode="w", encoding="utf-8") as mH:
write_groovy(t_tree, mH, reset_prev_wants_space=True)

return t_tree


if __name__ == "__main__":
logging.basicConfig(
level=logging.DEBUG,
)
log = logging.getLogger() # root logger
for filename in sys.argv[1:]:
print(f"* Parsing {filename}")
logfile = filename + ".lark"
jsonfile = logfile + ".json"
mirrored_filename = filename + ".mirrored"
fH = logging.FileHandler(logfile, mode="w", encoding="utf-8")
for hdlr in log.handlers[:]: # remove all old handlers
log.removeHandler(hdlr)
log.addHandler(fH) # set the new handler
try:
mirror_groovy_source(filename, jsonfile, mirrored_filename)
except Exception as e:
print(f"\tParse failed, see {logfile}")
logging.exception("Parse failed")
fH.close()
