Skip to content

Commit

Permalink
prevent regex groups from being entered, correct typo in README.md
Browse files Browse the repository at this point in the history
  • Loading branch information
joheli committed Feb 16, 2024
1 parent a6a832e commit 5bef9b2
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ tipps_*
helper
dumpster/
*.bat
.vscode/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ This should add the executable `rosinenpicker` to `PATH`, making it accessible f
Please type

```
rosenpicker -c config_file -d database_file
rosinenpicker -c config_file -d database_file
```

where `config_file` (default: `config.yml`) and `database_file` (default: `matches.db`) represent a yml-formatted configuration file (please see sample [config.yml](configs/config.yml), which is more or less self-explanatory) and a sqlite database file (automatically created if not present), respectively.

For help type

```
rosenpicker -h
rosinenpicker -h
```
9 changes: 6 additions & 3 deletions configs/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@ strategies:
# terms
terms:
# Choose names for the terms and associate each with a regex pattern or, alternatively,
# two regex patterns surrounding '~@~', which serves as a divider.
# two regex patterns surrounding '@@@', which serves as a divider.
# In the former case (i.e. only one regex pattern, no divider) matches to the regex are returned.
# In the latter case (i.e. two regex patterns, divider present) the two regex patterns are converted
# to groups surrounding a central "match-all" (.*) pattern. Only matches to the central group are returned.
# In the latter case (i.e. divider present) the one or two regex patterns are converted
# to groups surrounding a central "match-all" (.*) pattern. Only matches to the "match-all"-group
# are returned.
my_first_term: 'regex1'
my_second_term: 'regex2@@@regex3'
my_third_term: '@@@regex'
my_fourth_term: 'regex@@@'
# export format
# currently, the following formats are supported: csv, xlsx, html, json
export_format: 'xlsx'
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ build-backend = "hatchling.build"
exclude = [
"pdfs/",
"configs/",
"testconfigs/",
".github/",
"helper/",
"dumpster/",
Expand Down
32 changes: 28 additions & 4 deletions src/rosinenpicker/pydantic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,27 @@ def non_empty_string(cls, v: str):
assert v != '', 'Must be a non-empty string'
return v.strip()

# @field_validator('terms')
# @classmethod
# def check_terms(cls, t: dict[str, str]):
# checks = [cls.is_regex(p) for _, p in t.items()]
# if not all(checks):
# raise ConfigError(f"Concerning {t!r}: No regex groups are allowed.")

@classmethod
def compile_regex(cls, p: str) -> re.Pattern:
    """Compile ``p`` into a regular-expression pattern object.

    Args:
        p: The pattern string to compile.

    Returns:
        The compiled ``re.Pattern``.

    Raises:
        ConfigError: If ``p`` cannot be compiled as a regex pattern. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        return re.compile(p)
    # Catch only what re.compile can raise for unusable input, so that
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    except (re.error, TypeError, ValueError, RecursionError, OverflowError) as err:
        raise ConfigError(f"Concerning pattern '{p}': this string cannot be used as a regex pattern!") from err

@field_validator('file_name_pattern', 'file_content_pattern')
@classmethod
def selection_must_be_regex(cls, v: str):
    """Validate a file-selection pattern and return it stripped.

    Args:
        v: The raw pattern string from the configuration.

    Returns:
        ``v`` with surrounding whitespace removed.

    Raises:
        ConfigError: If ``cls.is_regex`` rejects the stripped pattern.
    """
    v = v.strip()
    # A single check decides validity; a separate compile step beforehand
    # would only make the error message depend on which check fails first.
    if not cls.is_regex(v):
        raise ConfigError(f"Pattern '{v}' cannot be used as a regex pattern; also, regex groups are not allowed!")
    return v

@model_validator(mode='after')
Expand Down Expand Up @@ -69,7 +77,10 @@ def validate_file_format(cls, ff: str):
def is_regex(cls, patternstring: str) -> bool:
    """Tell whether ``patternstring`` is an acceptable regex pattern.

    A pattern is acceptable when it compiles and defines no capture groups
    (numbered or named). Non-capturing constructs such as ``(?:...)`` do not
    count as groups.

    Args:
        patternstring: The pattern string to check.

    Returns:
        ``True`` if the pattern compiles and has zero capture groups,
        ``False`` otherwise.
    """
    try:
        rgx = re.compile(patternstring)
    # Only compile failures mean "not a regex"; interrupts must propagate.
    except (re.error, TypeError, ValueError, RecursionError, OverflowError):
        return False
    # Also, do not allow regex groups
    return rgx.groups == 0
Expand All @@ -89,6 +100,9 @@ def is_regex(cls, patternstring: str) -> bool:
# the number of capture groups present.
# In case no capture groups have been formed, the second and third integers are set to -1.
def process_terms(cls, patternstring: str, divider: str = "@@@") -> tuple[re.Pattern, int, int]:
# if patternstrings contains groups, reject
if not cls.is_regex(patternstring):
raise ConfigError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!")
# helper to check if pattern only consists of a matchall pattern
def matchall_only(s) -> bool:
    """Return True if ``s`` is exactly the match-all pattern ``.*``."""
    # A two-character string containing the literal text ".*" can only be
    # ".*" itself, so a direct comparison replaces the regex search (and its
    # invalid "\." / "\*" escapes in a non-raw string literal).
    return s == ".*"
Expand All @@ -106,9 +120,19 @@ def matchall_only(s) -> bool:
return (cls.compile_regex(patternstring), -1, -1)
# process the patternstrings divided by divider
multiple_patternstrings = re.split(divider, patternstring)

# check if patternstring and multiple_patternstrings are valid regex patterns without groups
all_strings = multiple_patternstrings.copy()
all_strings.append(patternstring)
all_check = [cls.is_regex(s) for s in all_strings]
if not all(all_check):
raise ConfigError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!")

#breakpoint()

# do any of the patternstrings only contain a matchall pattern?
if any([matchall_only(p) for p in multiple_patternstrings]):
raise ConfigError(msg=f"At least one of {multiple_patternstrings!r} only consists of a matchall-pattern '.*' and can therefore not be processed.")
raise ConfigError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.")
# is any of the patternstrings of length 0?
lenx = [len(i) for i in multiple_patternstrings]
lenx0 = [l == 0 for l in lenx]
Expand Down
2 changes: 1 addition & 1 deletion src/rosinenpicker/start.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.0.5'
__version__ = '0.1.0'
import yaml
import re
import os
Expand Down

0 comments on commit 5bef9b2

Please sign in to comment.