Skip to content

Commit

Permalink
Prioritize numbers next to currencies
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Jun 24, 2019
1 parent 9cf1888 commit e44e242
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 22 deletions.
76 changes: 60 additions & 16 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import re
import string
from typing import Callable, Optional, Pattern, List, Tuple
from typing import Callable, Match, Optional, Pattern, List, Tuple
from decimal import Decimal, InvalidOperation

import attr
Expand Down Expand Up @@ -36,11 +36,17 @@ def fromstring(cls, price: Optional[str],
``price`` string, it could be **preferred** over a value extracted
from ``currency_hint`` string.
"""
amount_text = extract_price_text(price) if price is not None else None
currency_match, source = _extract_currency_symbol(price, currency_hint)
if price is not None:
_currency_match = currency_match if source == price else None
amount_text = extract_price_text(price, _currency_match)
else:
amount_text = None
amount_num = parse_number(amount_text) if amount_text is not None else None
currency = extract_currency_symbol(price, currency_hint)
if currency is not None:
currency = currency.strip()
if currency_match is not None:
currency = currency_match.group(0).strip()
else:
currency = None
return Price(
amount=amount_num,
currency=currency,
Expand Down Expand Up @@ -120,11 +126,11 @@ def or_regex(symbols: List[str]) -> Pattern:
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search


def extract_currency_symbol(price: Optional[str],
currency_hint: Optional[str]) -> Optional[str]:
def _extract_currency_symbol(price: Optional[str], currency_hint: Optional[str]) -> Tuple[Optional[Match], Optional[str]]:
"""
Guess currency symbol from extracted price and currency strings.
Return an empty string if symbol is not found.
Guess the currency symbol from extracted price and currency strings.
Return a (`match object`_, source_string) tuple with the symbol found and
the string where it was found, or (None, None) if no symbol is found.
"""
methods: List[Tuple[Callable, Optional[str]]] = [
(_search_safe_currency, price),
Expand All @@ -142,17 +148,32 @@ def extract_currency_symbol(price: Optional[str],
for meth, attr in methods:
m = meth(attr) if attr else None
if m:
return m.group(0)
return m, attr

return None, None


def extract_currency_symbol(price: Optional[str],
                            currency_hint: Optional[str]) -> Optional[str]:
    """
    Guess the currency symbol from the price and currency hint strings.

    Thin wrapper around ``_extract_currency_symbol`` that discards the
    source string and keeps only the matched text.

    Return the symbol exactly as it was found, or None if no symbol is
    found.
    """
    found = _extract_currency_symbol(price, currency_hint)[0]
    return found.group(0) if found else None


def extract_price_text(price: str) -> Optional[str]:
def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]:
"""
Extract text of a price from a string which contains price and
maybe some other text. If multiple price-looking substrings are present,
the first is returned (FIXME: it is better to return a number
which is near a currency symbol).
maybe some other text.
If a match object of the currency within the `price` string is provided,
amounts before or after the matched currency substring are prioritized.
Otherwise, if multiple price-looking substrings are present, the first is
returned.
>>> extract_price_text("price: $12.99")
'12.99'
Expand Down Expand Up @@ -189,16 +210,39 @@ def extract_price_text(price: str) -> Optional[str]:
""", price, re.VERBOSE)
if m:
return m.group(0).replace(' ', '')

def number_from_match(m):
    """Return the number captured in group 1 of ``m``, cleaned up.

    Leading/trailing commas and dots are stripped first, then any
    surrounding whitespace.
    """
    captured = m.group(1)
    without_separators = captured.strip(',.')
    return without_separators.strip()

if currency_match is not None:

m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s*$ # only match right before the currency symbol
""", price[:currency_match.start(0)], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
^\s* # only match right after the currency symbol
(\d[\d\s.,]*) # number, probably with thousand separators
\s* # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price[currency_match.end(0):], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s* # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price, re.VERBOSE)

if m:
return m.group(1).strip(',.').strip()
return number_from_match(m)

if 'free' in price.lower():
return '0'

return None


Expand Down
83 changes: 77 additions & 6 deletions tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pytest

from price_parser import Price
from price_parser.parser import extract_currency_symbol


class Example(Price):
Expand Down Expand Up @@ -625,8 +626,6 @@ def __eq__(self, other):
'Р', '30', 30),
Example('€', '€ 139.00',
'€', '139.00', 139),
Example('There are 163 products.', 'From 26 to 50 €',
'€', '26', 26),
Example('Pris NOK 1 999,00', '139,00',
'NOK', '139,00', 139),
Example('/sqft', '1.52',
Expand Down Expand Up @@ -1909,15 +1908,55 @@ def __eq__(self, other):
'CHF', '19.90', 19.90),
Example('', '530,42 Zł',
'Zł', '530,42', 530.42),

# Prefer values next to currency symbols
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
Example(None, '2 items at 24,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or € 30,00',
'€', '24,00', 24.00),
]


PRICE_PARSING_EXAMPLES_XFAIL = [
# amount is picked as a price
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
'$', '60.00', 60),
Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',
None, None, None),
Example(None, '50 - $2.00 100 - $2.75 400 - $4.50 1,000 - $9.00 2,000 - $17.00 3,000 - $24.00 10,000 - $75.00',
Expand All @@ -1931,6 +1970,14 @@ def __eq__(self, other):
Example('Cuneo', '61.858 L', # Romanian New Leu
'L', '61.858', 61858),

# no handling of price ranges
Example('There are 163 products.', 'From 26 to 50 €',
'€', '26', 26),

# no handling of old-vs-new prices
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
'$', '60.00', 60),

# "р" / "руб" is detected as currency
Example('>', 'См. цену в прайсе',
None, None, None),
Expand Down Expand Up @@ -1977,6 +2024,30 @@ def test_parsing(example: Example):
assert parsed == example


# Check that extract_currency_symbol (called with no currency hint) returns
# the symbol text as found in the input, or None when nothing is recognized.
@pytest.mark.parametrize(
"input_string,symbol",
(
# no currency
('', None),
('1', None),
# fictional currency
('10 eddies', None),
# currency code
('5 CNY', 'CNY'),
# currency name (the expected match is 'euro', without the plural 's')
('5 euros', 'euro'),
# currency symbol
('$4', '$'),
)
)
def test_extract_currency_symbol(input_string, symbol):
assert extract_currency_symbol(input_string, None) == symbol


@pytest.mark.parametrize(
"amount,amount_float",
(
Expand Down

0 comments on commit e44e242

Please sign in to comment.