我想使用 Bert 语言模型来训练多类文本分类任务。
之前我使用 LSTM 进行训练,没有任何错误,但 Bert 给了我这个错误。
我收到以下错误,我真的不知道如何解决它,有人可以帮助我吗?
不幸的是,keras 库中使用 BERT 的文档很少。
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import tensorflow_hub as hub
from bert import tokenization
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
def bert_encode(texts, tokenizer, max_len=512):
all_tokens = []
all_masks = []
all_segments = []
for text in texts:
text = tokenizer.tokenize(text)
text = text[:max_len-2]
input_sequence = ["[CLS]"] + text + ["[SEP]"]
pad_len = max_len - len(input_sequence)
tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
pad_masks = [1] * len(input_sequence) + [0] * pad_len
segment_ids = [0] * max_len
all_tokens.append(tokens)
all_masks.append(pad_masks)
all_segments.append(segment_ids)
return np.array(all_tokens), np.array(all_masks), np.array(all_segments)
def build_model(bert_layer, max_len=512):
input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
clf_output = sequence_output[:, 0, :]
net = tf.keras.layers.Dense(64, activation='softmax')(clf_output)
net = tf.keras.layers.Dropout(0.2)(net)
net = tf.keras.layers.Dense(32, activation='softmax')(net)
net = tf.keras.layers.Dropout(0.2)(net)
out = tf.keras.layers.Dense(3, activation='softmax')(net)
model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
return model
max_len = 150
train_input = bert_encode(data.text_cleaned, tokenizer, max_len=max_len)
错误如下:
UnparsedFlagAccessError Traceback (most recent call last)
<ipython-input-175-fd64df42591d> in <module>()
1 import sys
2 max_len = 150
----> 3 train_input = bert_encode(o.text_cleaned, tokenizer, max_len=max_len)
4 frames
/usr/local/lib/python3.7/dist-packages/absl/flags/_flagvalues.py in __getattr__(self, name)
496 # get too much noise.
497 logging.error(error_message)
--> 498 raise _exceptions.UnparsedFlagAccessError(error_message)
499
500 def __setattr__(self, name, value):
UnparsedFlagAccessError: Trying to access flag --preserve_unused_tokens before flags were parsed.