Skip to content

Commit

Permalink
Index (work in progress)
Browse files Browse the repository at this point in the history
  • Loading branch information
sirex committed Oct 30, 2015
1 parent 7f6ca96 commit cd03ee6
Show file tree
Hide file tree
Showing 26 changed files with 221 additions and 68 deletions.
68 changes: 53 additions & 15 deletions botlib/indexfinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import collections
import argparse
import itertools
import gramtool

from databot.commands import CommandsManager, Command
from botlib.combinations import combinations, strjoin
Expand All @@ -11,6 +12,13 @@
norm_re = re.compile(r'\W+', re.UNICODE)


def _to_lemma(v):
    # Fall back to the unchanged input when gramtool returns a falsy result.
    return gramtool.get_lemma(v) or v


def _to_genitive(v):
    # Fall back to the unchanged input when gramtool returns a falsy result.
    return gramtool.change_form(v, case='genitive') or v


# Transformations selectable inside a ``{name:flag,...}`` expression.
# Each entry maps a flag name to a one-argument callable that takes a string
# and returns the transformed string.
FLAGS = dict(
    title=str.title,
    lemma=_to_lemma,
    genitive=_to_genitive,
)


def norm(value):
    """Return *value* normalised for comparison.

    Every run matched by ``norm_re`` (non-word characters) is replaced by a
    single space, surrounding whitespace is removed and the result is
    lower-cased.
    """
    collapsed = norm_re.sub(' ', value)
    trimmed = collapsed.strip()
    return trimmed.lower()

Expand Down Expand Up @@ -47,20 +55,24 @@ def create_aliases(self, index):
else:
index_aliases[name][norm(alias)] = choice
if patterns:
index_patterns[name].append((choice, patterns))
index_patterns[name].append((self.parse_patterns(choice), patterns))
return index_aliases, index_patterns

def parse_expr(self, expr):
    """Split a ``name:flag1,flag2`` expression into its name and flags.

    The part before the first ``:`` is the name; the part after it is a
    comma-separated list of flag names. Surrounding whitespace is ignored
    for the name and for every flag, and empty flag entries are dropped.

    Returns a ``(name, flags)`` pair where ``flags`` is a tuple of strings.

    Raises ``ValueError`` if a flag is not a key of ``FLAGS``.
    """
    name, _, raw_flags = expr.partition(':')
    # Flags were already trimmed; trim the name too so that
    # "{ city : title }" resolves to the same name as "{city:title}".
    name = name.strip()
    stripped = (part.strip() for part in raw_flags.split(','))
    flags = tuple(flag for flag in stripped if flag)
    for flag in flags:
        if flag not in FLAGS:
            raise ValueError("Unknown flag '%s' in '%s' expression." % (flag, expr))
    return name, flags

def parse_patterns(self, value):
result = []
for token in pattern_re.split(value):
token = token.strip()
token = token
if token.startswith('{'):
name, flags = self.parse_expr(token[1:-1])
name = int(name) if name.isnumeric() else name
result.append((name, flags))
elif token:
result.append(token)
Expand Down Expand Up @@ -138,11 +150,13 @@ def pattern_finder(self, patterns, value, stack=None):

# First check all raw strings, if at least one raw string does not match, skip.
for i, (token, pattern) in enumerate(zip(comb, patterns)):
if token == pattern:
choices[i].append(token)
elif isinstance(pattern, str):
skip = True
break
if isinstance(pattern, str):
pattern = pattern.strip()
if token == pattern:
choices[i].append(token)
else:
skip = True
break
if skip:
continue

Expand All @@ -151,6 +165,7 @@ def pattern_finder(self, patterns, value, stack=None):
if isinstance(pattern, tuple):
appended = False
name, flags = pattern
token = self.handle_flags(token, flags)
if (name, token) not in stack:
for item in self.find(name, token, stack | {(name, token)}):
choices[i].append(item)
Expand All @@ -171,7 +186,34 @@ def pattern_to_str(self, pattern):
result.append('{%s}' % expr)
else:
result.append(group)
return ' '.join(result)
return ''.join(result)

def replace(self, groups, replacement):
    """Render *replacement* using the values captured in *groups*.

    *groups* is an iterable of ``((name, flags), (id, value, source))``
    pairs. Each captured ``value`` can be referenced from the replacement
    either by position (integer name) or by the group's name.

    *replacement* is a sequence of tokens: plain strings are copied as is,
    ``(name, flags)`` tuples are substituted with the referenced value
    after it has been passed through ``self.handle_flags``.

    Returns the concatenation of all rendered tokens.
    """
    positional = []
    named = {}
    for (group_name, _flags), (_id, matched, _source) in groups:
        positional.append(matched)
        named[group_name] = matched

    def render(token):
        if not isinstance(token, tuple):
            return token
        ref, flags = token
        # Integer references index the groups in order; anything else is
        # looked up by group name.
        lookup = positional if isinstance(ref, int) else named
        return self.handle_flags(lookup[ref], flags)

    return ''.join(render(token) for token in replacement)

def handle_flags(self, value, flags):
    """Apply each flag's transformation from ``FLAGS`` to *value*, in order.

    Returns the transformed value; with no flags, *value* is returned
    unchanged.
    """
    result = value
    for flag_name in flags:
        transform = FLAGS[flag_name]
        result = transform(result)
    return result

def find(self, name, value, stack=None):
idx = self.index[name]
Expand All @@ -189,16 +231,12 @@ def find(self, name, value, stack=None):
for replacement, patterns in self.patterns[name]:
for pattern in patterns:
for groups in self.pattern_finder(pattern, value, stack):
source = '%s -> %s' % (self.pattern_to_str(pattern), replacement)
if replacement == '(import)':
source = '%s -> %s' % (self.pattern_to_str(pattern), self.pattern_to_str(replacement))
if replacement == ['(extends)']:
(((name, flags), (id, _value, _source)),) = groups
yield (id, _value, source)
else:
_value = replacement
groups = [(k, norm(v)) for (k, flags), (id, v, _source) in groups]
for i, (k, v) in enumerate(groups):
_value = _value.replace('{%d}' % i, v)
_value = _value.format(**dict(groups))
_value = self.replace(groups, replacement)
yield idx.get(norm(_value), (None, _value)) + (source,)


Expand Down
68 changes: 68 additions & 0 deletions index/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
Classification used in this index
=================================

{person} - A living (or deceased) person.

{enterprise} - All forms of enterprise companies.

{government} - Government institution or division.

{party} - Political party.

{ngo} - NGO (non-governmental organisation), basically everything that
is neither an enterprise nor a government institution.

{preschool} - Kindergartens and other preschool institutions.

{school} - School.

{college} - College.

{university} - University.

{finance} - Financial institutions (banks, leasing).

{insurance} - Insurance and broker companies.

{individual} - Enterprise led by a single person; an individual company.

{forename} - First name.

{surname} - Last name.

{country} - List of countries.

{city} - List of cities.

{abbreviation} - List of abbreviations.

{agent} - Includes person, enterprise, government, party, ngo and
other agents that can be represented as an acting body.

WordNet classification
======================

- social group
- gathering
- assembly
- legislative body
- parliament
- organisation
- social unit
- administrative body
- division
- department
- government-department
- ministry
- institution
- educational institution
- financial institution
- bank
- nondepository financial institution
- insurance company
- company (įmonė)
- enterprise (bendrovė)
- individual company (individuali įmonė)
- public company (viešojo sektoriaus įmonė)
- association
- political party
File renamed without changes.
File renamed without changes.
3 changes: 0 additions & 3 deletions index/actor/aliases.txt

This file was deleted.

13 changes: 13 additions & 0 deletions index/agent/aliases.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
(extends)
{person}
{enterprise}
{government}
{party}
{ngo}
{preschool}
{school}
{university}
{college}
{finance}
{insurance}
{individual}
File renamed without changes.
File renamed without changes.
14 changes: 0 additions & 14 deletions index/association/aliases.txt

This file was deleted.

17 changes: 0 additions & 17 deletions index/city/aliases.txt
Original file line number Diff line number Diff line change
@@ -1,17 +0,0 @@
Vilnius
Vilniaus

Kaunas
Kauno

Klaipėda
Klaipėdos

Šiauliai
Šiaulių

Kelmė
Kelmės

Jurbarkas
Jurbarko
File renamed without changes.
2 changes: 2 additions & 0 deletions index/company/choices.txt → index/enterprise/choices.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,5 @@
58,Autoerdvė
59,Omen faustum
60,Eveleta
61,D. Jonylienės
62,L. Graužinienės
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added index/individual/choices.txt
Empty file.
17 changes: 17 additions & 0 deletions index/ngo/aliases.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{ngo}
aso {ngo}
asociacija {ngo}
{ngo} draugija
visuomeninė organizacija {ngo}
vo {ngo}
visuomeninis judėjimas {ngo}
visuomeninė organizacija {ngo} pirmininkas

{city:title,genitive} rotary klubas
{city:lemma} rotary klubas

Macierz szkolna
lietuvos lenkų mokyklų mokytojų draugija macierz szkolna

Mokslo ir enciklopedijų leidybos institutas
111959573
2 changes: 2 additions & 0 deletions index/association/choices.txt → index/ngo/choices.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@
1,Už pilietinę visuomenę
1,Konservatyvioji ateitis
1,Šviešos centras
1,Mokslo ir enciklopedijų leidybos institutas
1,Labdaros ir paramos fondas Išgirsk mane
9 changes: 0 additions & 9 deletions index/organisation/aliases.txt

This file was deleted.

2 changes: 0 additions & 2 deletions index/organisation/choices.txt

This file was deleted.

2 changes: 0 additions & 2 deletions index/sole-trader/choices.txt

This file was deleted.

1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
packages=find_packages(),
install_requires=[
'databot',
'gramtool',
],
entry_points={
'console_scripts': [
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/index/bank/aliases.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
{bank} bankas
{bank} bank
bankas {bank}
{bank:genitive} lizingas
{bank:lemma} lizingas
Loading

0 comments on commit cd03ee6

Please sign in to comment.