import os
import re
import glob
import json
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem
import russian_tagsets
# pymorphy2
# Two analyzers are created: the default one and one built with
# probability_estimator_cls=None (pymorphy2_analyze picks it when prob=False).
morph = MorphAnalyzer(lang='ru')
morph_noprob = MorphAnalyzer(lang='ru', probability_estimator_cls=None)
# mystem
mystem = Mystem(disambiguation=False, grammar_info=True)
# Adjust the argument list built by pymystem3: drop '-gi' and '-c',
# then append '-i' and '--eng-gr'.
# NOTE(review): this relies on the private `_mystemargs` attribute; presumably
# '--eng-gr' makes mystem emit English grammeme names - confirm with mystem docs.
mystem._mystemargs.remove('-gi')
mystem._mystemargs.remove('-c')
mystem._mystemargs += ['-i', '--eng-gr']
Участвуют три тегсета: OpenCorpora (pymorphy2 и часть размеченных данных), НКРЯ (часть размеченных данных) и mystem.
Библиотека russian-tagsets умеет преобразовывать из формата OpenCorpora в формат НКРЯ. Из НКРЯ все приводим к формату mystem.
# Converter from the 'opencorpora-int' tagset to the 'ruscorpora' tagset;
# py2mystem calls it with a tag string.
_pym2ruscorpora = russian_tagsets.converters.converter('opencorpora-int', 'ruscorpora')
def ruscorpora2mystem(tag):
    """Translate a ruscorpora.ru tag string into a mystem tag string."""
    # The substitutions are applied strictly in this order. 'loc2' is parked
    # under the temporary name 'LOC' so that the following 'loc' -> 'abl'
    # rewrite does not touch it; it is renamed to 'loc' right after.
    substitutions = (
        ('-', ''),
        ('zoon', 'persn'),
        ('loc2', 'LOC'),
        ('loc', 'abl'),
        ('LOC', 'loc'),
        ('fut', 'inpraes'),  # ~sort of
        ('gen2', 'part'),
        ('PARENTH', 'parenth'),
        ('PRAEDIC', 'praed'),
    )
    for old, new in substitutions:
        tag = tag.replace(old, new)
    return tag
def py2mystem(tag):
    """
    Translate a pymorphy2 tag into a mystem tag string.

    The tag is stringified, converted to the ruscorpora tagset and then
    passed through ruscorpora2mystem.
    """
    return ruscorpora2mystem(_pym2ruscorpora(str(tag)))
# Bound `split` of a pattern matching the two grammeme separators: ',' and '='.
_tag2grammemes = re.compile(r'[,=]').split


def tag2grammemes(tag):
    """
    Split a tag string into a list of grammemes.

    Both ',' and '=' are treated as separators, e.g.
    'S,m,anim=abl,pl' -> ['S', 'm', 'anim', 'abl', 'pl'].
    """
    return _tag2grammemes(tag)
def mystem_analyze(token):
    """
    Analyze a single token using mystem.

    Return the list of parse dicts found under the 'analysis' key of the
    first item of mystem's output; a trailing '=' is stripped from every
    'gr' value.

    Return None if mystem analyzes the token as multiple tokens (the text
    of the first output item differs from ``token``) or returns no items
    at all. Return an empty list if the first item carries no 'analysis'.
    """
    res = mystem.analyze(token)
    # Guard against an empty result or an item without 'text' instead of
    # failing with IndexError / KeyError.
    if not res or res[0].get('text') != token:
        return None
    # NOTE(review): presumably mystem omits 'analysis' for non-lexical
    # tokens; treat that as "no parses" rather than raising KeyError.
    result = res[0].get('analysis', [])
    for p in result:
        if 'gr' in p:
            p['gr'] = p['gr'].rstrip('=')
    return result
def pymorphy2_analyze(token, prob=True):
    """
    Analyze a single token using pymorphy2.

    With ``prob`` true the ``morph`` analyzer is used, otherwise
    ``morph_noprob``. Every parse becomes a dict with the tag converted
    by py2mystem under 'gr' and the normal form under 'lex'.
    """
    analyzer = morph_noprob if not prob else morph
    parses = []
    for parse in analyzer.parse(token):
        parses.append({'gr': py2mystem(parse.tag), 'lex': parse.normal_form})
    return parses
mystem_analyze('друзьях')
[{'gr': 'S,m,anim=abl,pl', 'lex': 'друг'}]
pymorphy2_analyze('друзьях')
[{'gr': 'S,anim,m=pl,abl', 'lex': 'друг'}]
Для оценки качества собран корпус из 2 частей:
def read_microcorpus_file(path):
    """
    Read a microcorpus file into a list of (token, tag) tuples.

    Blank lines are skipped. Every other line is split at its first space
    into a token and a tag, and both parts are stripped of surrounding
    whitespace.
    """
    pairs = []
    with open(path, 'rt', encoding='utf8') as f:
        for line in f:
            if not line.strip():
                continue
            tok, tag = line.split(' ', 1)
            pairs.append((tok.strip(), tag.strip()))
    return pairs
def read_ruscorpora_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded data."""
    with open(path, 'rt', encoding='utf8') as f:
        raw = f.read()
    return json.loads(raw)
# Microcorpus: every *.txt file in ./microcorpus-done/ is read as one
# sentence, i.e. a list of (token, tag) pairs.
sents_microcorpus_src = [
read_microcorpus_file(path)
for path in glob.glob('./microcorpus-done/*.txt')
]
# Ruscorpora sample: loaded from a single JSON file.
sents_ruscorpora_src = read_ruscorpora_json('./ruscorpora-100-fixed.json')
# Bring both corpora to mystem tags: microcorpus tags go through py2mystem,
# ruscorpora tags go through ruscorpora2mystem.
sents_microcorpus = [
[(tok, py2mystem(tag)) for tok, tag in sent]
for sent in sents_microcorpus_src
]
sents_ruscorpora = [
[(tok, ruscorpora2mystem(tag)) for tok, tag in sent]
for sent in sents_ruscorpora_src
]
# Combined evaluation corpus.
sents = sents_microcorpus + sents_ruscorpora
print("microcorpus: %d sents; ruscorpora: %d sents" % (len(sents_microcorpus), len(sents_ruscorpora)))
microcorpus: 100 sents; ruscorpora: 100 sents
Пунктуация, цифры и латинские слова не учитываются.
def to_tokens(sents):
    """
    Flatten sentences into a list of (token, grammemes) pairs.

    ``sents`` is an iterable of sentences, each an iterable of (token, tag)
    pairs. Tags are split with tag2grammemes; tokens whose grammemes
    include 'PNCT', 'NONLEX' or 'ciph' are left out.
    """
    skipped = {'PNCT', 'NONLEX', 'ciph'}
    result = []
    for sent in sents:
        for tok, tag in sent:
            grammemes = tag2grammemes(tag)
            if skipped.isdisjoint(grammemes):
                result.append((tok, grammemes))
    return result
# Token lists for the whole corpus and for each part separately;
# the per-part lists are only used for the counts printed below.
tokens = to_tokens(sents)
tokens_microcorpus = to_tokens(sents_microcorpus)
tokens_ruscorpora = to_tokens(sents_ruscorpora)
print(
"Total tokens: %d (%d microcorpus + %d ruscorpora)" % (
len(tokens), len(tokens_microcorpus), len(tokens_ruscorpora))
)
Total tokens: 2498 (1405 microcorpus + 1093 ruscorpora)
Учитываются теги целиком - оценивается качество полного морфологического анализа.
При этом из-за того, что приходится преобразовывать 3 различных набора тегов друг в друга, некоторые различия различиями не считаются - различия могут быть вызваны неточностью преобразования тегов или разными подходами к разметке. Чтобы понять, зачем нужно каждое из условий, можно его закомментировать и посмотреть, какие появятся дополнительные несоответствия в разборах.
def _gram(p):
""" Extract grammemes from a parse result. """
if isinstance(p, dict):
return tag2grammemes(p['gr'])
if isinstance(p, str):
return tag2grammemes(p)
return p
def tags_diff(t1, t2):
    """
    Return a set of grammemes which are different between t1 and t2,
    taking conversion issues into account.

    t1 and t2 may be anything ``_gram`` accepts: a parse dict with a 'gr'
    key, a tag string, or an iterable of grammemes. An empty set means
    that the tags are considered equivalent.
    """
    gr1 = set(_gram(t1))
    gr2 = set(_gram(t2))
    diff = gr1 ^ gr2
    comb = gr1 | gr2
    common = gr1 & gr2

    # Grammemes which are never counted as a difference.
    diff -= {'anim', 'inan', 'persn', 'famn', '0', 'obsol', 'geo', 'distort', 'med', 'act', 'plen'}

    # Every rule below makes one particular discrepancy count as "no
    # difference". To see why a rule is needed, comment it out and look
    # at the extra mismatches which show up.
    # All rules return an empty *set* (the original returned `{}`, which
    # is a dict) so the return type is always the same.
    if diff == {'ADV'} and ({'parenth', 'praed'} & comb):
        return set()
    if diff == {'PART'} and 'parenth' in comb:
        return set()
    if diff == {'parenth'}:
        return set()
    if diff == {'CONJ', 'parenth'}:
        return set()

    if diff == {'inpraes', 'praes'} and 'ipf' in comb:
        return set()
    if diff == {'fut', 'inpraes', 'ipf'}:
        return set()
    if diff == {'tran'} or diff == {'inpraes', 'praes', 'tran'}:
        return set()
    if diff == {'ipf'}:
        return set()

    if 'S' in diff and 'INIT' in diff and 'abbr' in common:
        return set()

    if diff == {'SPRO', 'APRO'}:
        return set()
    # These two rules look only at the shared grammemes, so any remaining
    # difference is discarded when both tags are SPRO (or both are APRO).
    if 'SPRO' in common:
        return set()
    if 'APRO' in common:
        return set()

    if diff == {'A', 'NUM'}:
        return set()
    if diff == {'praed', 'ADV'}:
        return set()
    if diff == {'praed', 'A'}:
        return set()
    if diff == {'APRO', 'ANUM', 'sg'}:
        return set()
    if diff == {'ADV', 'ADVPRO'}:
        return set()
    if diff == {'A', 'ADV'} and 'comp' in common:
        return set()
    if diff == {'A', 'pl', 'brev', 'ADV'}:
        return set()

    if diff == {'mf', 'm'}:
        return set()
    if diff == {'abbr'} or diff == {'abbr', 'f'}:
        return set()
    # NOTE(review): `and` binds tighter than `or`, so the 'abbr' check
    # applies only to the {'m'} case; a lone 'f' difference is always
    # ignored. The parentheses keep the original behavior explicit -
    # confirm whether `(f or m) and abbr` was intended.
    if diff == {'f'} or (diff == {'m'} and 'abbr' in common):
        return set()

    return diff
def tags_match(t1, t2):
    """
    Tell whether tags t1 and t2 are the same, taking tagset conversion
    issues into account: True when tags_diff finds no difference.
    """
    remaining = tags_diff(t1, t2)
    return not remaining
def has_correct(correct, parses):
    """
    Tell whether any of ``parses`` matches the ``correct`` tag
    (as decided by tags_match).

    ``parses`` equal to None is reported as True.
    """
    # mystem can't parse most hyphenated words as a single token;
    # don't consider it an error
    if parses is None:
        return True
    return any(tags_match(candidate, correct) for candidate in parses)
def is_bad(correct, parses):
    """Tell whether ``parses`` lacks a parse matching ``correct`` (negation of has_correct)."""
    found = has_correct(correct, parses)
    return not found
Если среди предложенных морфологическим анализатором вариантов нет совпадающего с правильным, то разбор считается неправильным.
Это предварительный шаг - все ошибки будут потом еще раз проверены вручную.
Примечание: в выборке из НКРЯ было найдено 6 ошибок на 100 предложений; вот что устранено:
# Tokens for which none of the analyzer's parses matches the gold tag.
pymorphy2_errors = [(tok, gr) for tok, gr in tokens if is_bad(gr, pymorphy2_analyze(tok))]
# For mystem, tokens that mystem_analyze reports as None are not counted
# as errors (has_correct returns True for them).
mystem_errors = [(tok, gr) for tok, gr in tokens if is_bad(gr, mystem_analyze(tok))]
py_err = len(pymorphy2_errors)
my_err = len(mystem_errors)
# The percentage is the share of tokens that are not in the error list.
print("pymorphy2: %d errors ==> %0.1f%% has correct results" % (py_err, 100*(1-py_err/len(tokens))))
print("mystem: %d errors ==> %0.1f%% has correct results" % (my_err, 100*(1-my_err/len(tokens))))
print("Note: not all errors are real errors; see below")
pymorphy2: 21 errors ==> 99.2% has correct results mystem: 28 errors ==> 98.9% has correct results Note: not all errors are real errors; see below
pymorphy2_errors
[('Юнг', ['S', 'anim', 'm', 'famn', 'sg', 'nom']), ('ВОВ', ['S', 'inan', 'f', '0', 'abbr', 'sg', 'gen']), ('т', ['PART', 'abbr']), ('г', ['S', 'inan', 'm', '0', 'abbr', 'sg', 'loc']), ('Малхолланда', ['S', 'anim', 'm', 'famn', 'sg', 'gen']), ('св', ['A', 'plen', 'abbr', 'm', 'sg', 'gen']), ('Дыа', ['PART', 'distort']), ('дыа', ['PART', 'distort']), ('Диана', ['S', 'anim', 'mf', '0', 'famn', 'sg', 'nom']), ('Та-а-ак', ['PART', 'distort']), ('ПРО', ['S', 'inan', 'f', '0', 'abbr', 'sg', 'nom']), ('МБП', ['S', 'n', 'inan', '0', 'sg', 'gen']), ('полгода', ['S', 'm', 'inan', 'sg', 'acc']), ('НПФ', ['S', 'm', 'inan', '0', 'pl', 'acc']), ('также', ['PART']), ('фитнесса', ['S', 'm', 'inan', 'sg', 'gen']), ('Слуцкер', ['S', 'famn', 'f', 'anim', 'sg', 'ins']), ('направленные', ['V', 'pf', 'partcp', 'praet', 'pass', 'pl', 'acc', 'intr']), ('полвека', ['S', 'm', 'inan', 'sg', 'acc']), ('СБ', ['S', 'm', 'inan', '0', 'sg', 'gen']), ('ОМУ', ['S', 'n', 'inan', '0', 'sg', 'gen'])]
mystem_errors
[('т', ['APRO', 'n', 'sg', 'dat', 'abbr']), ('п', ['A', 'plen', 'n', 'sg', 'nom', 'abbr']), ('т', ['PART', 'abbr']), ('е', ['V', 'ipf', 'intr', '0', 'abbr', 'sg', '3p', 'praes', 'indic']), ('Донского', ['S', 'anim', 'm', 'famn', 'sg', 'gen']), ('г', ['S', 'inan', 'm', '0', 'abbr', 'sg', 'loc']), ('св', ['A', 'plen', 'abbr', 'm', 'sg', 'gen']), ('млн', ['S', 'inan', 'm', '0', 'abbr', 'pl', 'gen']), ('Прожекторперисхилтон', ['S', 'inan', 'm', 'sg', 'acc']), ('Дыа', ['PART', 'distort']), ('дыа', ['PART', 'distort']), ('Диана', ['S', 'anim', 'mf', '0', 'famn', 'sg', 'nom']), ('скорее', ['parenth']), ('снарягу', ['S', 'inan', 'f', 'sg', 'acc']), ('ПРО', ['S', 'inan', 'f', '0', 'abbr', 'sg', 'nom']), ('проживающие', ['V', 'partcp', 'plen', 'ipf', 'intr', 'praes', 'act', 'pl', 'nom']), ('проживающего', ['V', 'partcp', 'plen', 'ipf', 'intr', 'praes', 'act', 'n', 'sg', 'gen']), ('проживающие', ['V', 'partcp', 'plen', 'ipf', 'intr', 'praes', 'act', 'pl', 'nom']), ('МБП', ['S', 'n', 'inan', '0', 'sg', 'gen']), ('т', ['ADVPRO', 'abbr']), ('д', ['ADV', 'abbr']), ('полгода', ['S', 'm', 'inan', 'sg', 'acc']), ('НПФ', ['S', 'm', 'inan', '0', 'pl', 'acc']), ('изменяет', ['V', 'ipf', 'intr', 'act', 'sg', 'praes', '3p', 'indic']), ('направленные', ['V', 'pf', 'partcp', 'praet', 'pass', 'pl', 'acc', 'intr']), ('полвека', ['S', 'm', 'inan', 'sg', 'acc']), ('СБ', ['S', 'm', 'inan', '0', 'sg', 'gen']), ('ОМУ', ['S', 'n', 'inan', '0', 'sg', 'gen'])]
pymorphy2:
mystem:
pymorphy2: 10+9=19 (или 7+9=16 без учета сокращений)
mystem: 15+8=23 (или 8+6=14 без учета сокращений)
В ручной разметке НКРЯ (случайная выборка из 100 предложений, какая-то старая выгрузка) было 6 ошибок.
**==ЭТО ТОЛЬКО НАБРОСОК==**
pymorphy2 умеет снимать неоднозначность на уровне отдельных слов (без учета контекста). Набросок кода для оценки качества (набросок, т.к. там, скорее всего, всякие ошибки из-за преобразования тегов из одного тегсета в другой):
def POS_match(t1, t2):
    """
    Tell whether t1 and t2 can be considered to have the same part of speech.

    The first grammeme of each tag is compared as the POS. When these
    differ, the tags still match if their difference is empty after
    dropping the ignored grammemes, or if it fits one of the known
    tagset-conversion discrepancies listed below.
    """
    # FIXME: code is a copy-paste of tags_diff with minor variations
    grammemes1 = _gram(t1)
    grammemes2 = _gram(t2)
    if grammemes1[0] == grammemes2[0]:
        return True

    set1, set2 = set(grammemes1), set(grammemes2)
    comb = set1 | set2
    common = set1 & set2
    ignored = {'anim', 'inan', 'persn', 'famn', '0', 'obsol', 'geo', 'distort', 'med', 'act', 'plen'}
    diff = (set1 ^ set2) - ignored
    if not diff:
        return True

    # Rules which depend on more than the difference itself.
    if 'SPRO' in common:
        return True
    if 'abbr' in common and {'S', 'INIT'} <= diff:
        return True
    if diff == {'ADV'} and not comb.isdisjoint({'parenth', 'praed'}):
        return True
    if diff == {'PART'} and 'parenth' in comb:
        return True
    if diff == {'A', 'ADV'} and 'comp' in common:
        return True

    # Differences which are accepted unconditionally.
    accepted = (
        {'CONJ', 'parenth'},
        {'SPRO', 'APRO'},
        {'A', 'NUM'},
        {'APRO', 'ANUM', 'sg'},
        {'ADV', 'ADVPRO'},
        {'A', 'pl', 'brev', 'ADV'},
        {'praed', 'ADV'},
        {'praed', 'A'},
    )
    return any(diff == pattern for pattern in accepted)
def first_correct(correct, parses):
    """
    Tell whether the first parse matches the ``correct`` tag (via tags_match).

    ``parses`` equal to None is reported as True; an empty ``parses``
    is reported as False.
    """
    if parses is None:
        return True
    return bool(parses) and tags_match(parses[0], correct)
def first_POS_correct(correct, parses):
    """
    Tell whether the first parse matches ``correct`` according to POS_match.

    ``parses`` equal to None is reported as True; an empty ``parses``
    is reported as False.
    """
    if parses is None:
        return True
    return bool(parses) and POS_match(parses[0], correct)
# Tokens whose FIRST pymorphy2 parse does not match the gold tag.
# Four variants: full tag vs. POS only, each with the default analyzer
# and with the analyzer selected by prob=False.
pymorphy2_disambig_errors = [
(tok, gr) for tok, gr in tokens
if not first_correct(gr, pymorphy2_analyze(tok))
]
pymorphy2_noprob_disambig_errors = [
(tok, gr) for tok, gr in tokens
if not first_correct(gr, pymorphy2_analyze(tok, prob=False))
]
pymorphy2_POS_disambig_errors = [
(tok, gr) for tok, gr in tokens
if not first_POS_correct(gr, pymorphy2_analyze(tok))
]
pymorphy2_noprob_POS_disambig_errors = [
(tok, gr) for tok, gr in tokens
if not first_POS_correct(gr, pymorphy2_analyze(tok, prob=False))
]
def perc_txt(errors):
    """
    Format the share of tokens that are NOT in ``errors`` as a percentage
    with one decimal, e.g. '93.7%'. The total is the length of the
    module-level ``tokens`` list.
    """
    error_share = len(errors) / len(tokens)
    return "%0.1f%%" % (100 - error_share * 100)
# Report the share of tokens whose first parse is correct, for the
# prob=False analyzer and for the default one.
print("pymorphy2 context-unaware disambiguation, % of correct analyses\n")
print("no P(tag|word): %s (full tagset), %s (POS only)" % (
perc_txt(pymorphy2_noprob_disambig_errors),
perc_txt(pymorphy2_noprob_POS_disambig_errors)))
print("with P(tag|word): %s (full tagset), %s (POS only)" % (
perc_txt(pymorphy2_disambig_errors),
perc_txt(pymorphy2_POS_disambig_errors)))
pymorphy2 context-unaware disambiguation, % of correct analyses no P(tag|word): 72.5% (full tagset), 86.1% (POS only) with P(tag|word): 81.7% (full tagset), 93.7% (POS only)