我的 spaCy NER 模型的输入数据采用 BILUO 标记方案,由于某些需求,我希望继续使用这种格式。
当我不使用小批量(minibatch)、逐条样本训练模型时,一切正常(见代码中被注释掉的部分)。但我不知道如何在这里结合 minibatch 和 GoldParse 来提高模型的准确性。我找不到任何同时使用这两者的示例,这种做法是否可行?另外,我之前已经用(开始, 结束, 标签)偏移量格式成功训练过模型。请帮我解决这一部分的问题。我的代码如下:
import spacy
from spacy.gold import offsets_from_biluo_tags
from spacy.gold import biluo_tags_from_offsets
import random
from spacy.util import minibatch, compounding
from os import path
from tqdm import tqdm
def _biluo_to_annotations(nlp, text, biluo_tags):
    """Convert one BILUO-tagged example into an ``{'entities': [...]}`` dict.

    The text is tokenized with ``nlp.make_doc`` (tokenizer only) and the
    per-token BILUO tags are turned into ``(start, end, label)`` character
    offsets with ``offsets_from_biluo_tags``.

    Raises:
        ValueError: if the number of tags differs from the number of tokens,
            since the tags would then be attached to the wrong tokens.
    """
    doc = nlp.make_doc(text)
    if len(doc) != len(biluo_tags):
        raise ValueError(
            "BILUO tags do not align with tokens for %r: %d tokens %s but %d tags"
            % (text, len(doc), [token.text for token in doc], len(biluo_tags))
        )
    return {'entities': offsets_from_biluo_tags(doc, biluo_tags)}


def train_spacy(data, iterations, model=None):
    """Train the NER pipe on BILUO-tagged examples using minibatches.

    Args:
        data: list of ``(text, biluo_tags)`` pairs, one BILUO tag per token.
            The list itself is not modified.
        iterations: number of passes over the training data.
        model: optional path of an existing model to continue training. When
            it is ``None`` or the path does not exist, a blank English
            pipeline is created instead.

    Returns:
        The trained ``nlp`` object.

    Raises:
        ValueError: if an example has a different number of tags and tokens.
    """
    print(f"downloads = {model}")
    loaded_existing = model is not None and path.exists(model)
    if loaded_existing:
        print("training existing model")
        nlp = spacy.load(model)
        print("Model is Loaded '%s'" % model)
    else:
        print("Creating new model")
        nlp = spacy.blank('en')  # create blank Language class

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Based on template, get labels and save those for further training
    for label in ("Name", "ORG"):
        ner.add_label(label)

    # Convert every example once, up front: each text gets its own annotation
    # dict. Building a new list also keeps the shuffle below from reordering
    # the caller's data.
    train_data = [
        (text, _biluo_to_annotations(nlp, text, biluo_tags))
        for text, biluo_tags in data
    ]

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # A blank pipeline must be initialised with begin_training(), even
        # when a model path was given but did not exist on disk.
        if loaded_existing:
            # NOTE(review): if the loaded model had no 'ner' pipe, the pipe
            # added above is new and may still need initialising - confirm.
            optimizer = nlp.entity.create_optimizer()
        else:
            optimizer = nlp.begin_training()

        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            # Per-example alternative (no minibatch), kept for reference:
            # for text, annotations in tqdm(train_data):
            #     nlp.update([text],         # batch of texts
            #                [annotations],  # batch of annotations
            #                drop=0.5,       # dropout - make it harder to memorise data
            #                sgd=optimizer,  # callable to update weights
            #                losses=losses)
            batches = minibatch(train_data, size=compounding(4.0, 16.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                print(texts)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # one annotation dict per text
                    drop=0.4,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(losses)
    return nlp
# Training examples as (text, BILUO tags) pairs, one tag per token.
# The comma after the name is a separate token, so it needs its own 'O' tag;
# without it every following tag is shifted one token to the left.
data_biluo = [
    (
        'I am Shah Khan, I work in MS Co',
        ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG'],
    ),
    (
        'I am Tom Tomb, I work in Telecom Networks',
        ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG'],
    ),
]
# Train for 10 iterations on a new blank model and save it to disk.
model = train_spacy(data_biluo, 10)
model.to_disk('./Vectors/')