# validate.py

"""
Validate JSON to ensure that patterns all work
"""
from __future__ import print_function

import json
import re
from collections import Counter
import datetime

from jsonschema import validate


# JSON Schema for the crawler list: an array of objects. Each object must
# provide "pattern" and "instances"; every other listed property is optional.
JSON_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            # required
            "pattern": {"type": "string"},
            # required
            "instances": {"type": "array"},
            # optional
            "url": {"type": "string"},
            # optional
            "description": {"type": "string"},
            # optional
            "addition_date": {"type": "string"},
            # allows an instance to match twice
            "depends_on": {"type": "array"},
        },
        "required": ["pattern", "instances"],
    },
}


def main():
    """Validate the crawler patterns stored in ``crawler-user-agents.json``.

    The file is read from the current working directory and checked, in
    this order, for:

    * conformance to ``JSON_SCHEMA``;
    * the presence of at least one entry;
    * duplicated patterns (exact, then ignoring capitalization);
    * unescaped ``/`` or ``.`` characters anywhere in a pattern;
    * per entry: the ``addition_date`` format, unknown properties,
      duplicated instances, and each instance being matched by the pattern;
    * patterns that match the text of another entry's pattern.

    Entries lacking ``depends_on`` get an empty list for it (in memory only).
    Prints ``Validation passed`` when every check succeeds.

    Raises:
        ValueError: on a duplicated pattern, an unescaped slash or dot, an
            invalid ``addition_date``, an unknown property, an instance
            that its pattern does not match, or a pattern that is a subset
            of another one.
        Exception: when the file holds no entry, or when an entry lists the
            same instance more than once.

    Errors raised by ``json.load``, ``validate`` and ``re`` propagate
    unchanged.
    """
    # Local import: ``io.open`` accepts ``encoding`` on both Python 2 and 3.
    import io

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with io.open('crawler-user-agents.json', encoding='utf-8') as f:
        json_data = json.load(f)

    # check format using JSON Schema
    validate(json_data, JSON_SCHEMA)

    # Make sure we have at least one pattern
    if len(json_data) < 1:
        raise Exception("no pattern")

    patterns = [entry['pattern'] for entry in json_data]

    # check for simple duplicates
    for pattern, count in Counter(patterns).most_common():
        if count > 1:
            raise ValueError('Pattern {!r} appears {} times'.format(pattern,
                                                                    count))

    # check for duplicates with different capitalization
    lowered_counts = Counter(pattern.lower() for pattern in patterns)
    for pattern, count in lowered_counts.most_common():
        if count > 1:
            raise ValueError('Pattern {!r} is duplicated {} times with different capitalization'
                             .format(pattern, count))

    # The negative lookbehind (instead of "[^\\]") also catches a slash or
    # dot that is the very first character of the pattern.

    # check that no pattern contains unescaped slash /
    for pattern in patterns:
        if re.search(r'(?<!\\)/', pattern):
            raise ValueError('Pattern {!r} has an unescaped slash character'.format(pattern))

    # check that no pattern contains unescaped dot .
    for pattern in patterns:
        if re.search(r'(?<!\\)\.', pattern):
            raise ValueError('Pattern {!r} has an unescaped dot character'.format(pattern))

    # Property names an entry may use; computed once for all entries.
    allowed_properties = set(JSON_SCHEMA['items']['properties'])

    # check that we match the given instances
    for entry in json_data:
        pattern = entry['pattern']

        # assert that field "addition_date" has format "2019/12/23",
        if 'addition_date' in entry:
            if not re.match(r'\d{4}/\d{2}/\d{2}', entry['addition_date']):
                raise ValueError('addition_date {!r} has invalid format'.format(entry['addition_date']))
            # parse the date with datetime (rejects impossible dates and
            # trailing characters with a ValueError)
            datetime.datetime.strptime(entry['addition_date'], '%Y/%m/%d')

        # canonicalize entry
        entry.setdefault('depends_on', [])

        # check that we have only the right properties (not handled by
        # default in module jsonschema). An explicit raise is used because
        # assert statements are removed when Python runs with -O.
        unknown_properties = set(entry) - allowed_properties
        if unknown_properties:
            raise ValueError('the entry contains unknown properties: {}'
                             .format(', '.join(sorted(unknown_properties))))

        instances = entry.get('instances')
        if instances:
            # check that there is no duplicate
            if len(instances) != len(set(instances)):
                raise Exception("duplicate instances in "+pattern)
            for instance in instances:
                if not re.search(pattern, instance):
                    raise ValueError('Pattern {!r} misses instance {!r}'
                                     .format(pattern, instance))

    # Check for patterns that match other patterns. Each pattern is compiled
    # once up front rather than looked up again for every pair. Patterns are
    # known to be unique at this point, so comparing the pattern strings is
    # enough to skip an entry's comparison with itself.
    compiled = [(pattern, re.compile(pattern, re.IGNORECASE))
                for pattern in patterns]
    for pattern1, matcher in compiled:
        for pattern2 in patterns:
            if pattern1 != pattern2 and matcher.search(pattern2):
                raise ValueError('Pattern {!r} is a subset of {!r}'
                                 .format(pattern2, pattern1))

    print('Validation passed')


# Run the validation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()