diff --git a/price_parser/parser.py b/price_parser/parser.py index 2099c35..2a707ad 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import re import string -from typing import Callable, Optional, Pattern, List, Tuple +from typing import Callable, Match, Optional, Pattern, List, Tuple from decimal import Decimal, InvalidOperation import attr @@ -36,11 +36,11 @@ def fromstring(cls, price: Optional[str], ``price`` string, it could be **preferred** over a value extracted from ``currency_hint`` string. """ - amount_text = extract_price_text(price) if price is not None else None + currency, source = _extract_currency_symbol(price, currency_hint) + amount_text = extract_price_text(price, currency if source == price else None) if price is not None else None amount_num = parse_number(amount_text) if amount_text is not None else None - currency = extract_currency_symbol(price, currency_hint) if currency is not None: - currency = currency.strip() + currency = currency.group(0).strip() return Price( amount=amount_num, currency=currency, @@ -120,11 +120,12 @@ def or_regex(symbols: List[str]) -> Pattern: _search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search -def extract_currency_symbol(price: Optional[str], - currency_hint: Optional[str]) -> Optional[str]: +def _extract_currency_symbol(price: Optional[str], + currency_hint: Optional[str]) -> Optional[str]: """ - Guess currency symbol from extracted price and currency strings. - Return an empty string if symbol is not found. + Guess the currency symbol from extracted price and currency strings. + Return a (`match object`_, source_string) tuple with the symbol found and + the string where it was found, or (None, None) if no symbol is found. """ methods: List[Tuple[Callable, Optional[str]]] = [ (_search_safe_currency, price), @@ -142,17 +143,32 @@ def extract_currency_symbol(price: Optional[str], for meth, attr in methods: m = meth(attr) if attr else None if m: - return m.group(0) + return m, attr + + return None, None + +def extract_currency_symbol(price: Optional[str], + currency_hint: Optional[str]) -> Optional[str]: + """ + Guess currency symbol from extracted price and currency strings. + Return the symbol as found as a string, or None if no symbol is found. + """ + match, _ = _extract_currency_symbol(price, currency_hint) + if match: + return match.group(0) return None -def extract_price_text(price: str) -> Optional[str]: +def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]: """ Extract text of a price from a string which contains price and - maybe some other text. If multiple price-looking substrings are present, - the first is returned (FIXME: it is better to return a number - which is near a currency symbol). + maybe some other text. + + If a match object of the currency within the `price` string is provided, + amounts before or after the matched currency substring are prioritized. + Otherwise, if multiple price-looking substrings are present, the first is + returned. >>> extract_price_text("price: $12.99") '12.99' @@ -189,16 +205,39 @@ def extract_price_text(price: str) -> Optional[str]: """, price, re.VERBOSE) if m: return m.group(0).replace(' ', '') + + def number_from_match(m): + return m.group(1).strip(',.').strip() + + if currency_match is not None: + + m = re.search(r""" + (\d[\d\s.,]*) # number, probably with thousand separators + \s*$ # only match right before the currency symbol + """, price[:currency_match.start(0)], re.VERBOSE) + if m: + return number_from_match(m) + + m = re.search(r""" + ^\s* # only match right after the currency symbol + (\d[\d\s.,]*) # number, probably with thousand separators + \s* # skip whitespace + (?:[^%\d]|$) # capture next symbol - it shouldn't be % + """, price[currency_match.end(0):], re.VERBOSE) + if m: + return number_from_match(m) + m = re.search(r""" (\d[\d\s.,]*) # number, probably with thousand separators \s* # skip whitespace (?:[^%\d]|$) # capture next symbol - it shouldn't be % """, price, re.VERBOSE) - if m: - return m.group(1).strip(',.').strip() + return number_from_match(m) + if 'free' in price.lower(): return '0' + return None diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py index af9d889..c41b585 100644 --- a/tests/test_price_parsing.py +++ b/tests/test_price_parsing.py @@ -618,7 +618,7 @@ def __eq__(self, other): Example('€', '€ 139.00', '€', '139.00', 139), Example('There are 163 products.', 'From 26 to 50 €', - '€', '26', 26), + '€', '50', 50), Example('Pris NOK 1 999,00', '139,00', 'NOK', '139,00', 139), Example('/sqft', '1.52', @@ -1901,13 +1901,13 @@ def __eq__(self, other): 'CHF', '19.90', 19.90), Example('', '530,42 Zł', 'Zł', '530,42', 530.42), + Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR', + 'EUR', '14,85', 14.85), ] PRICE_PARSING_EXAMPLES_XFAIL = [ # amount is picked as a price - Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR', - 'EUR', '14,85', 14.85), Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00', '$', '60.00', 60), Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',