merge: add previous commits on branch master

aphp · Jul 3, 2024 · 83ce9f9 · 83ce9f9
2 parents 0b4eda6 + 9d1a640
commit 83ce9f9
Show file tree

Hide file tree

Showing 87 changed files with 3,296 additions and 2,056 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -7,7 +7,8 @@ on:
     branches: [master]
 
 jobs:
-  Linting:
+  linting:
+    name: Linting
     if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
     steps:
@@ -16,88 +17,144 @@ jobs:
           # required to grab the history of the PR
           fetch-depth: 0
       - uses: actions/setup-python@v4
-        with:
-          cache: 'pip'
       - uses: pre-commit/[email protected]
         with:
           extra_args: --color=always --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
 
-  Pytest:
+  pytest:
+    name: Pytest
     runs-on: ubuntu-latest
     strategy:
       fail-fast: true
       matrix:
         python-version: ["3.7", "3.8", "3.9"]
     steps:
       - uses: actions/checkout@v2
+
       - name: Cache downloaded resources
         uses: actions/cache@v3
         with:
           path: ~/.data/
           key: resources
+
+      # - name: Cache pip
+      #   uses: actions/cache@v3
+      #   with:
+      #     path: ~/.cache/pip
+      #     key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip
+
+      - run: echo WEEK=$(date +%V) >>$GITHUB_ENV
+        shell: bash
+
+      - uses: hynek/setup-cached-uv@v1
+        with:
+          cache-suffix: -tests-${{ matrix.python-version }}-${{ env.WEEK }}
+
       - name: Set up Java
         uses: actions/setup-java@v2
         with:
           distribution: "temurin" # See 'Supported distributions' for available options
           java-version: "8"
+
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
 
       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install pipx
-          pipx install poetry
-          pip install -e '.[dev,setup]'
+          uv venv
+          source .venv/bin/activate
+          uv pip install -e '.[dev,setup]' pytest-xdist poetry pip
 
       - name: Test with Pytest on Python ${{ matrix.python-version }}
         env:
           UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }}
-        run: python -m pytest --cov edsnlp --cov-report xml --ignore tests/test_docs.py
+        run: |
+          source .venv/bin/activate
+          coverage run -m pytest --ignore tests/test_docs.py # -n auto
+          # coverage combine
+          # mv .coverage .coverage.${{ matrix.python-version }}
         if: matrix.python-version != '3.9'
 
       - name: Test with Pytest on Python ${{ matrix.python-version }}
         env:
           UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }}
-        run: python -m pytest --cov edsnlp --cov-report xml
+        run: |
+          source .venv/bin/activate
+          coverage run -m pytest # -n auto
+          # coverage combine
+          # mv .coverage .coverage.${{ matrix.python-version }}
         if: matrix.python-version == '3.9'
 
-      - name: Upload coverage
-        uses: codecov/codecov-action@v2
+      - name: Upload coverage data
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-data-${{ matrix.python-version }}
+          path: .coverage.*
+          # .coverage.* are dotfiles: upload-artifact >= 4.4 skips hidden files
+          # unless this is enabled, which would silently upload nothing here.
+          include-hidden-files: true
+          if-no-files-found: ignore
 
+  coverage:
+    name: Coverage
+    needs: pytest
+    uses: aphp/foldedtensor/.github/workflows/coverage.yml@main
+    with:
+      base-branch: master
+      coverage-data-pattern: coverage-data-*
+      coverage-report: coverage.txt
+      coverage-badge: coverage.svg
+      coverage-branch: coverage
 
-  Documentation:
+  documentation:
+    name: Documentation
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
+
     - uses: actions/setup-python@v4
       with:
         python-version: "3.9"
-        cache: 'pip'
+        # cache: 'pip'
+
+    - run: echo WEEK=$(date +%V) >>$GITHUB_ENV
+      shell: bash
+
+    - uses: hynek/setup-cached-uv@v1
+      with:
+        # This job has no build matrix, so `matrix.python-version` would expand
+        # to an empty string; use the Python version pinned in setup-python.
+        cache-suffix: -docs-3.9-${{ env.WEEK }}
+
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        pip install '.[dev]'
+        uv venv
+        uv pip install '.[dev]'
+
     - name: Build documentation
       run: |
+        source .venv/bin/activate
         mkdocs build --clean
 
-  Installation:
+  simple-installation:
+    name: Simple installation
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         python-version: ["3.7", "3.8", "3.9"]
     steps:
       - uses: actions/checkout@v2
+
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          cache: 'pip'
+
+      - run: echo WEEK=$(date +%V) >>$GITHUB_ENV
+        shell: bash
+
+      - uses: hynek/setup-cached-uv@v1
+        with:
+          cache-suffix: -simple-install-${{ matrix.python-version }}-${{ env.WEEK }}
+
       - name: Install library
         run: |
-          pip install .
+          uv venv
+          uv pip install .
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 [![Documentation](https://img.shields.io/github/actions/workflow/status/aphp/edsnlp/documentation.yml?branch=master&label=docs&style=flat-square)](https://aphp.github.io/edsnlp/latest/)
 [![PyPI](https://img.shields.io/pypi/v/edsnlp?color=blue&style=flat-square)](https://pypi.org/project/edsnlp/)
 [![Demo](https://img.shields.io/badge/demo%20%F0%9F%9A%80-streamit-grean?style=flat-square)](https://aphp.github.io/edsnlp/demo/)
-[![Codecov](https://img.shields.io/codecov/c/github/aphp/edsnlp?logo=codecov&style=flat-square)](https://codecov.io/gh/aphp/edsnlp)
+[![Coverage](https://raw.githubusercontent.com/aphp/edsnlp/coverage/coverage.svg)](https://raw.githubusercontent.com/aphp/edsnlp/coverage/coverage.txt)
 [![DOI](https://zenodo.org/badge/467585436.svg)](https://zenodo.org/badge/latestdoi/467585436)
 
 
@@ -34,13 +34,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
 You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).
 
 ```shell
-pip install edsnlp==0.11.2
+pip install edsnlp==0.12.3
 ```
 
 or if you want to use the trainable components (using pytorch)
 
 ```shell
-pip install "edsnlp[ml]==0.11.2"
+pip install "edsnlp[ml]==0.12.3"
 ```
 
 ### A first pipeline

diff --git a/docs/index.md b/docs/index.md
@@ -15,13 +15,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
 You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).
 
 ```{: data-md-color-scheme="slate" }
-pip install edsnlp==0.11.2
+pip install edsnlp==0.12.3
 ```
 
 or if you want to use the trainable components (using pytorch)
 
 ```{: data-md-color-scheme="slate" }
-pip install "edsnlp[ml]==0.11.2"
+pip install "edsnlp[ml]==0.12.3"
 ```
 
 ### A first pipeline

diff --git a/docs/pipes/trainable/index.md b/docs/pipes/trainable/index.md
@@ -8,13 +8,13 @@ All trainable components implement the [`TorchComponent`][edsnlp.core.torch_comp
 
 <!-- --8<-- [start:components] -->
 
-| Name                 | Description                                                          |
-|----------------------|----------------------------------------------------------------------|
-| `eds.transformer`    | Embed text with a transformer model                                  |
-| `eds.text_cnn`       | Contextualize embeddings with a CNN                                  |
-| `eds.span_pooler`    | A span embedding component that aggregates word embeddings           |
-| `eds.ner_crf`        | A trainable component to extract entities                            |
-| `eds.span_qualifier` | A trainable component for multi-class multi-label span qualification |
-| `eds.span_linker`    | A trainable entity linker (i.e. to a list of concepts)               |
+| Name                  | Description                                                           |
+|-----------------------|-----------------------------------------------------------------------|
+| `eds.transformer`     | Embed text with a transformer model                                   |
+| `eds.text_cnn`        | Contextualize embeddings with a CNN                                   |
+| `eds.span_pooler`     | A span embedding component that aggregates word embeddings            |
+| `eds.ner_crf`         | A trainable component to extract entities                             |
+| `eds.span_classifier` | A trainable component for multi-class multi-label span classification |
+| `eds.span_linker`     | A trainable entity linker (i.e. to a list of concepts)                |
 
 <!-- --8<-- [end:components] -->
diff --git a/docs/pipes/trainable/span-classifier.md b/docs/pipes/trainable/span-classifier.md
@@ -0,0 +1,8 @@
+# Trainable Span Classifier {: #edsnlp.pipes.trainable.span_classifier.factory.create_component }
+
+::: edsnlp.pipes.trainable.span_classifier.factory.create_component
+    options:
+        heading_level: 2
+        show_bases: false
+        show_source: false
+        only_class_level: true
diff --git a/docs/pipes/trainable/span-qualifier.md b/docs/pipes/trainable/span-qualifier.md
diff --git a/docs/tutorials/make-a-training-script.md b/docs/tutorials/make-a-training-script.md
@@ -157,15 +157,15 @@ for step in tqdm(range(max_steps), "Training model", leave=True):
 ### 7. Optimizing the weights
 
 Inside the training loop, the trainable components are fed the collated batches from the dataloader by calling
-the [`TorchComponent.module_forward`][edsnlp.core.torch_component.TorchComponent.module_forward] methods to compute the losses. In the case we train a multi-task model (not in this tutorial), the
+the [`TorchComponent.forward`][edsnlp.core.torch_component.TorchComponent.forward] method (via a simple call) to compute the losses. In the case we train a multi-task model (not in this tutorial), the
 outputs of shared embeddings are reused between components, so we enable caching by wrapping this step in a cache context. The training loop is otherwise carried out in a similar fashion to a standard pytorch
 training loop
 
 ```{ .python .no-check }
     with nlp.cache():
         loss = torch.zeros((), device="cpu")
         for name, component in nlp.torch_components():
-            output = component.module_forward(batch[name])  # (1)
+            output = component(batch[name])  # (1)
             if "loss" in output:
                 loss += output["loss"]
 
@@ -174,8 +174,6 @@ training loop
     optimizer.step()
 ```
 
-1. We use the `module_forward` instead of a standard call, since the `__call__` method of EDS-NLP components is used to run a component on a spaCy Doc.
-
 ### 8. Evaluating the model
 
 Finally, the model is evaluated on the validation dataset and saved at regular intervals.
@@ -296,7 +294,7 @@ Let's wrap the training code in a function, and make it callable from the comman
             loss = torch.zeros((), device="cpu")
             with nlp.cache():
                 for name, component in nlp.torch_components():
-                    output = component.module_forward(batch[name])
+                    output = component(batch[name])
                     if "loss" in output:
                         loss += output["loss"]
 

diff --git a/docs/tutorials/multiple-texts.md b/docs/tutorials/multiple-texts.md
@@ -83,6 +83,7 @@ for doc in docs:
             end=date.end_char,
             label="date",
             entity_text=date.text,
+            datetime=date._.date.datetime,
         )
         rows.append(d)
 df = pd.DataFrame(rows)
@@ -160,7 +161,8 @@ def convert_doc_to_rows(doc):
             begin=date.start_char,
             end=date.end_char,
             label="date",
-            entity_text=date.text,
+            lexical_variant=date.text,
+            datetime=date._.date.datetime,
         )
         entities.append(d)
 
@@ -172,7 +174,13 @@ df = docs.to_pandas(converter=convert_doc_to_rows)
 df = docs.to_pandas(
     converter="ents",
     span_getter=["ents", "dates"],
-    span_attributes=["negation", "hypothesis", "family"],
+    span_attributes={
+        # span._.*** name: column name
+        "negation": "negation",
+        "hypothesis": "hypothesis",
+        "family": "family",
+        "date.datetime": "datetime",
+    },
 )
 ```
 
@@ -263,10 +271,14 @@ note_nlp = docs.to_pandas(
     # Below are the arguments to the converter
     span_getter=["ents", "dates"],
     span_attributes={  # (1)
+        # span._.*** name: column name
         "negation": "negation",
         "hypothesis": "hypothesis",
         "family": "family",
-        "date.day": "date_day",  # slugified name
+        "date.datetime": "datetime",
+        # having individual columns for each date part
+        # can be useful for incomplete dates (eg, "in May")
+        "date.day": "date_day",
         "date.month": "date_month",
         "date.year": "date_year",
     },
@@ -308,9 +320,13 @@ note_nlp = docs.to_pandas(
         "negation": "negation",
         "hypothesis": "hypothesis",
         "family": "family",
-        "date.day": "date_day",  # slugify the extension name
+        "date.datetime": "datetime",
+
+        # having individual columns for each date part
+        # can be useful for incomplete dates (eg, "in May")
+        "date.day": "date_day",
         "date.month": "date_month",
-        "date.year": "date_year"
+        "date.year": "date_year",
     },
 )
 ```
@@ -336,9 +352,13 @@ note_nlp = docs.to_spark(
         "negation": "negation",
         "hypothesis": "hypothesis",
         "family": "family",
-        "date.day": "date_day",  # slugify the extension name
+        "date.datetime": "datetime",
+
+        # having individual columns for each date part
+        # can be useful for incomplete dates (eg, "in May")
+        "date.day": "date_day",
         "date.month": "date_month",
-        "date.year": "date_year"
+        "date.year": "date_year",
     },
     dtypes=None,  # (1)
 )