-
Notifications
You must be signed in to change notification settings - Fork 1
/
reg_exp_definder.py
87 lines (85 loc) · 4.73 KB
/
reg_exp_definder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import re
import json
import ftfy
# Every label the classifier may emit besides 'NONE'. 'IPv6' is accepted here
# although no pattern below ever produces it (kept for backward compatibility).
_LITERAL_LABELS = frozenset({
    'INTEGER', 'SYMBOL', 'DATE', 'TIME', 'LOGIC', 'MAIL', 'CURRENCY', 'ISSN',
    'ISBN', 'IPv4', 'IPv6', 'PERCENT', 'CARD', 'COLOR', 'EMAIL', 'FLOAT',
})

# Pattern matching is attempted only for cells with at least one Latin letter,
# Cyrillic letter, or digit.
_HAS_ALNUM = re.compile('[A-Za-z0-9А-Яа-я]')

# Ordered (label, pattern) pairs. Every pattern is tried and the LAST one that
# matches decides the label, so more specific patterns must come later.
_LITERAL_PATTERNS = (
    ('INTEGER', re.compile(r'^[-+]?[0-9]+$')),
    # Any run of four digits starting with 0-2 is treated as a year.
    ('DATE', re.compile(r'[0-2][0-9][0-9][0-9]')),
    # FLOAT used to be checked last; being unanchored it overrode IPv4, dotted
    # dates, decimal percentages, etc. It now sits right after the bare-year
    # rule so it still wins over that one but yields to the specific patterns.
    ('FLOAT', re.compile(r'[+-]?\d+\.\d+')),
    ('DATE', re.compile(
        r'(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d'
        r'|(19|20)\d\d-((0[1-9]|1[012])-(0[1-9]|[12]\d)'
        r'|(0[13-9]|1[012])-30|(0[13578]|1[02])-31)')),
    ('TIME', re.compile(
        r'^(0?[1-9]|1[0-2]):[0-5][0-9]$'
        r'|((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))'
        r'|^(0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$'
        r'|^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$'
        r'|(?:[01]\d|2[0-3]):(?:[0-5]\d):(?:[0-5]\d)')),
    # Fixed: the previous pattern ('^"true|false|...|FALSE"&') had stray
    # quotes, no grouping and '&' instead of '$', so plain "true"/"FALSE" never
    # matched while any text containing e.g. "false" did.
    ('LOGIC', re.compile(r'^(?:true|false|True|False|TRUE|FALSE)$')),
    # Exactly six digits.
    ('MAIL', re.compile(r'^\d{6}$')),
    # An integer followed by '$' at the start, or any text containing £ / €.
    ('CURRENCY', re.compile(r'^[-+]?([1-9]\d*|0)\$|£|€')),
    # Five digits with an optional 4-digit suffix.
    ('MAIL', re.compile(r'^\d{5}(?:[-\s]\d{4})?$')),
    ('ISSN', re.compile(r'^[0-9]{4}-[0-9]{3}[0-9xX]$')),
    ('ISBN', re.compile(r'^(?:ISBN(?:: ?| ))?((?:97[89])?\d{9}[\dx])+$')),
    ('IPv4', re.compile(
        r'((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)')),
    ('PERCENT', re.compile(
        r'((\b100)|(\b[0-9]{1,2}\.?[0-9]?))(?=%| *percent)')),
    ('CARD', re.compile(
        r'^([456][0-9]{3})-?([0-9]{4})-?([0-9]{4})-?([0-9]{4})$')),
    ('COLOR', re.compile(r'#[0-9A-Fa-f]{6}')),
    ('EMAIL', re.compile(r'[\w.-]+@[\w.-]+\.?[\w]+?')),
)


def _classify_literal(cell_text):
    """Return the literal-type label for a single, already cleaned cell string."""
    # A cell whose text is literally 'NONE' is relabelled 'SYMBOL'; otherwise
    # the raw text is the provisional label until a pattern replaces it.
    label = 'SYMBOL' if cell_text == 'NONE' else cell_text
    if _HAS_ALNUM.search(cell_text):
        for name, pattern in _LITERAL_PATTERNS:
            if pattern.search(cell_text):
                label = name  # no break: the last matching pattern wins
    # NOTE(review): the original indentation was lost, so it is unclear whether
    # this fallback also applied to cells without alphanumeric characters.
    # It is applied to every cell here so the result is always a label.
    return label if label in _LITERAL_LABELS else 'NONE'


def create_json(json_path, json_path1):
    """
    Determine the literal value type of every table cell.

    The JSON document at ``json_path`` is loaded and iterated as a sequence of
    rows, each row mapping a key to a cell string. Every cell string is passed
    through ``ftfy.fix_text`` and then replaced by a label: 'INTEGER', 'FLOAT',
    'DATE', 'TIME', 'LOGIC', 'MAIL', 'CURRENCY', 'ISSN', 'ISBN', 'IPv4',
    'PERCENT', 'CARD', 'COLOR', 'EMAIL', 'SYMBOL', or 'NONE' when nothing
    matched.

    :param json_path: name of the source json file
    :param json_path1: name of the file in which the literal values of the
        cells are to be defined; this function does not write to it, the name
        is only handed back to the caller
    :return: tuple ``(json_path1, text)`` where ``text`` is the loaded data
        with every cell value replaced by its label
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        text = json.load(f)
    for row in text:
        # Only values are reassigned, so iterating the row's keys stays valid.
        for key in row:
            row[key] = _classify_literal(ftfy.fix_text(row[key]))
    return json_path1, text