diff --git a/price_parser/parser.py b/price_parser/parser.py index 1a17349..9336d72 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import re import string -from typing import Callable, Optional, Pattern, List, Tuple +from typing import Callable, Match, Optional, Pattern, List, Tuple from decimal import Decimal, InvalidOperation import attr @@ -36,11 +36,17 @@ def fromstring(cls, price: Optional[str], ``price`` string, it could be **preferred** over a value extracted from ``currency_hint`` string. """ - amount_text = extract_price_text(price) if price is not None else None + currency_match, source = _extract_currency_symbol(price, currency_hint) + if price is not None: + _currency_match = currency_match if source == price else None + amount_text = extract_price_text(price, _currency_match) + else: + amount_text = None amount_num = parse_number(amount_text) if amount_text is not None else None - currency = extract_currency_symbol(price, currency_hint) - if currency is not None: - currency = currency.strip() + if currency_match is not None: + currency = currency_match.group(0).strip() + else: + currency = None return Price( amount=amount_num, currency=currency, @@ -120,11 +126,11 @@ def or_regex(symbols: List[str]) -> Pattern: _search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search -def extract_currency_symbol(price: Optional[str], - currency_hint: Optional[str]) -> Optional[str]: +def _extract_currency_symbol(price: Optional[str], currency_hint: Optional[str]) -> Tuple[Optional[Match], Optional[str]]: """ - Guess currency symbol from extracted price and currency strings. - Return an empty string if symbol is not found. + Guess the currency symbol from extracted price and currency strings. + Return a (`match object`_, source_string) tuple with the symbol found and + the string where it was found, or (None, None) if no symbol is found. 
""" methods: List[Tuple[Callable, Optional[str]]] = [ (_search_safe_currency, price), @@ -142,17 +148,32 @@ def extract_currency_symbol(price: Optional[str], for meth, attr in methods: m = meth(attr) if attr else None if m: - return m.group(0) + return m, attr + + return None, None + +def extract_currency_symbol(price: Optional[str], + currency_hint: Optional[str]) -> Optional[str]: + """ + Guess currency symbol from extracted price and currency strings. + Return the symbol as found as a string, or None if no symbol is found. + """ + match, _ = _extract_currency_symbol(price, currency_hint) + if match: + return match.group(0) return None -def extract_price_text(price: str) -> Optional[str]: +def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]: """ Extract text of a price from a string which contains price and - maybe some other text. If multiple price-looking substrings are present, - the first is returned (FIXME: it is better to return a number - which is near a currency symbol). + maybe some other text. + + If a match object of the currency within the `price` string is provided, + amounts before or after the matched currency substring are prioritized. + Otherwise, if multiple price-looking substrings are present, the first is + returned. 
>>> extract_price_text("price: $12.99") '12.99' @@ -189,16 +210,39 @@ def extract_price_text(price: str) -> Optional[str]: """, price, re.VERBOSE) if m: return m.group(0).replace(' ', '') + + def number_from_match(m): + return m.group(1).strip(',.').strip() + + if currency_match is not None: + + m = re.search(r""" + (\d[\d\s.,]*) # number, probably with thousand separators + \s*$ # only match right before the currency symbol + """, price[:currency_match.start(0)], re.VERBOSE) + if m: + return number_from_match(m) + + m = re.search(r""" + ^\s* # only match right after the currency symbol + (\d[\d\s.,]*) # number, probably with thousand separators + \s* # skip whitespace + (?:[^%\d]|$) # capture next symbol - it shouldn't be % + """, price[currency_match.end(0):], re.VERBOSE) + if m: + return number_from_match(m) + m = re.search(r""" (\d[\d\s.,]*) # number, probably with thousand separators \s* # skip whitespace (?:[^%\d]|$) # capture next symbol - it shouldn't be % """, price, re.VERBOSE) - if m: - return m.group(1).strip(',.').strip() + return number_from_match(m) + if 'free' in price.lower(): return '0' + return None diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py index 4875b11..026915f 100644 --- a/tests/test_price_parsing.py +++ b/tests/test_price_parsing.py @@ -17,6 +17,7 @@ import pytest from price_parser import Price +from price_parser.parser import extract_currency_symbol class Example(Price): @@ -625,8 +626,6 @@ def __eq__(self, other): 'Р', '30', 30), Example('€', '€ 139.00', '€', '139.00', 139), - Example('There are 163 products.', 'From 26 to 50 €', - '€', '26', 26), Example('Pris NOK 1 999,00', '139,00', 'NOK', '139,00', 139), Example('/sqft', '1.52', @@ -1909,15 +1908,55 @@ def __eq__(self, other): 'CHF', '19.90', 19.90), Example('', '530,42 Zł', 'Zł', '530,42', 530.42), + + # Prefer values next to currency symbols + Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR', + 'EUR', '14,85', 14.85), + Example(None, 
'2 items at 24,00€', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00 €', + '€', '24,00', 24.00), + Example(None, '2 items at €24,00', + '€', '24,00', 24.00), + Example(None, '2 items at € 24,00', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00€ or 30,00€', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00€ or 30,00 €', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00€ or €30,00', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00€ or € 30,00', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00 € or 30,00€', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00 € or 30,00 €', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00 € or €30,00', + '€', '24,00', 24.00), + Example(None, '2 items at 24,00 € or € 30,00', + '€', '24,00', 24.00), + Example(None, '2 items at €24,00 or 30,00€', + '€', '24,00', 24.00), + Example(None, '2 items at €24,00 or 30,00 €', + '€', '24,00', 24.00), + Example(None, '2 items at €24,00 or €30,00', + '€', '24,00', 24.00), + Example(None, '2 items at €24,00 or € 30,00', + '€', '24,00', 24.00), + Example(None, '2 items at € 24,00 or 30,00€', + '€', '24,00', 24.00), + Example(None, '2 items at € 24,00 or 30,00 €', + '€', '24,00', 24.00), + Example(None, '2 items at € 24,00 or €30,00', + '€', '24,00', 24.00), + Example(None, '2 items at € 24,00 or € 30,00', + '€', '24,00', 24.00), ] PRICE_PARSING_EXAMPLES_XFAIL = [ # amount is picked as a price - Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR', - 'EUR', '14,85', 14.85), - Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00', - '$', '60.00', 60), Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. 
заказ: 1 )', None, None, None), Example(None, '50 - $2.00 100 - $2.75 400 - $4.50 1,000 - $9.00 2,000 - $17.00 3,000 - $24.00 10,000 - $75.00', @@ -1931,6 +1970,14 @@ def __eq__(self, other): Example('Cuneo', '61.858 L', # Romanian New Leu 'L', '61.858', 61858), + # no handling of price ranges + Example('There are 163 products.', 'From 26 to 50 €', + '€', '26', 26), + + # no handling of old-vs-new prices + Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00', + '$', '60.00', 60), + # "р" / "руб" is detected as currency Example('>', 'См. цену в прайсе', None, None, None), @@ -1977,6 +2024,30 @@ def test_parsing(example: Example): assert parsed == example +@pytest.mark.parametrize( + "input_string,symbol", + ( + # no currency + ('', None), + ('1', None), + + # fictional currency + ('10 eddies', None), + + # currency code + ('5 CNY', 'CNY'), + + # currency name + ('5 euros', 'euro'), + + # currency symbol + ('$4', '$'), + ) +) +def test_extract_currency_symbol(input_string, symbol): + assert extract_currency_symbol(input_string, None) == symbol + + @pytest.mark.parametrize( "amount,amount_float", (