Recently my advisor looked into Huawei's MindSpore framework, found it interesting, and asked me to run his text generation experiment. Quite a few details are not spelled out in his lab manual, so here I will walk you through the pitfalls one by one.
# This experiment comes entirely from Huawei's text generation lab manual; much of it is written unclearly, so here I clear out the pitfalls you will run into along the way.
Many people searching for this tool land on "OBS Browser", but note that what you actually want is OBS Browser+, a desktop application you need to download and install on your own computer. I skipped the installation because it seemed like a hassle, and that cost me a lot of detours. The correct way is shown in the figure below. It is also best to follow the login tutorial first: create a bucket in OBS, then obtain the AccessKey for it. This step also held me up for quite a while.
Once logged in, you can create an OBS bucket!
Create a bucket to hold the files needed for the experiment, then create a directory named "textgeneration" inside it to store the experiment data:
After creation it looks like this:
Download link: https://ascend-professional-construction-dataset.obs.cn-north-4.myhuaweicloud.com/NLP/NLP.zip
Then, in the "Natural Language Processing" course, under "Lab Guide" - "Text Generation", obtain the data and src files.
Go into the "textgeneration" folder you just created and upload the source code and data there. Simply drag the files and folders into the directory.
Next, go to ModelArts and choose Notebook.
On the Huawei Cloud homepage, search for ModelArts and click "Access Console", or go directly to:
https://console.huaweicloud.com/modelarts/?region=cn-north-4#/dashboard
In the left navigation pane, click "DevEnviron" and select "Notebook".
Click "Create" to create a new Notebook with the following configuration:
Name: textgeneration is recommended.
Image: tensorflow1.15-mindspore1.5.1-cann5.0.3-euler2.8-aarch64
Flavor: Ascend: 1*Ascend910 | CPU: 24 cores, 96 GB
Storage: default
Then submit the creation. (You get one hour of free usage; remember to stop the instance whenever you are not using it. Forgetting this cost me 20 RMB.)
When the Notebook status changes to "Running", click "Open" on the right. Inside, select the "MindSpore-python3.7-aarch64" kernel to enter the experiment environment.
Now the experiment proper begins.
##########################################################################
Before running the experiment, we need to download the files from OBS into the container's local environment. Replace the bucket name below with the name of the OBS bucket you created.
Enter:
import moxing as mox
mox.file.copy_parallel(src_url="s3://替换你自己的obs路径/textgeneration/data/", dst_url='./data/')
mox.file.copy_parallel(src_url="s3://替换你自己的obs路径/textgeneration/src/", dst_url='./src/')
For example, mine is:
import moxing as mox
mox.file.copy_parallel(src_url="obs://huaweitesttong/testgeneration/data/", dst_url='./data/')
mox.file.copy_parallel(src_url="obs://huaweitesttong/testgeneration/src/", dst_url='./src/')
Step 2: import the dependency libraries. (Another pitfall here: your Python must be 3.7.5 or 3.9.0, and you also need to have the mindspore module installed; see the quick check below.)
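The MindSpore-python3.7-aarch64 kernel selected earlier already satisfies this, but a quick check never hurts (my own snippet, not from the manual):

import sys
import mindspore
print(sys.version)            # expect a 3.7.x interpreter in this kernel
print(mindspore.__version__)  # expect 1.5.1 in this image

With that confirmed, import the dependencies: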
import os
import re
import time
import numpy as np
import mindspore.dataset as de
import mindspore.common.dtype as mstype
import mindspore.dataset.transforms.c_transforms as C
from mindspore import context
from mindspore import log as logger
from mindspore.train.model import Model
from mindspore.common.tensor import Tensor
from mindspore.train.serialization import export
from mindspore.common.parameter import Parameter
from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum
from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from easydict import EasyDict as edict
from src.bert_model import BertConfig
from src.poetry_dataset import create_tokenizer, padding
from src.utils import BertPoetry, BertPoetryCell, BertLearningRate, BertPoetryModel, LossCallBack
# run in graph mode on the Ascend accelerator (device 0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=0)
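As an aside of my own (not from the manual): if you only want to smoke-test the code on a machine without an Ascend card, MindSpore also accepts other targets, though actual training needs the Ascend setup above.

# e.g. a CPU dry run in eager mode (aside only; training requires Ascend):
# context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")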
# Define configuration parameters
bs = 16
cfg = edict({
    'dict_path': './data/vocab.txt',
    'disallowed_words': ['(', ')', '(', ')', '__', '《', '》', '【', '】', '[', ']'],
    'max_len': 64,
    'min_word_frequency': 8,
    'dataset_path': './data/poetry.txt',
    'batch_size': bs,
    'epoch_num': 2,
    'ckpt_prefix': 'poetry',
    'ckpt_dir': './data/checkpoint/',
    'pre_training_ckpt': './data/pretrain_ckpt/bert_base.ckpt',
    'optimizer': 'AdamWeightDecayDynamicLR',
    'AdamWeightDecay': edict({
        'learning_rate': 3e-5,
        'end_learning_rate': 1e-10,
        'power': 1.0,
        'weight_decay': 1e-5,
        'eps': 1e-6,
    }),
    'Lamb': edict({
        'start_learning_rate': 2e-5,
        'end_learning_rate': 1e-7,
        'power': 1.0,
        'weight_decay': 0.01,
        'decay_filter': lambda x: False,
    }),
    'Momentum': edict({
        'learning_rate': 2e-5,
        'momentum': 0.9,
    }),
})
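One thing worth noticing: cfg.optimizer is set to 'AdamWeightDecayDynamicLR', but the train() function further down hardcodes AdamWeightDecay and ignores this field. If you want the config to actually drive the choice, here is a minimal sketch using the three optimizers imported above (my own hypothetical helper, not from the manual):

def build_optimizer(params, lr):
    # hypothetical dispatcher over cfg.optimizer; the manual's train()
    # does not use it and constructs AdamWeightDecay directly
    if cfg.optimizer.startswith('AdamWeightDecay'):
        return AdamWeightDecay(params, lr,
                               eps=cfg.AdamWeightDecay.eps,
                               weight_decay=cfg.AdamWeightDecay.weight_decay)
    if cfg.optimizer == 'Lamb':
        return Lamb(params, lr, weight_decay=cfg.Lamb.weight_decay)
    if cfg.optimizer == 'Momentum':
        return Momentum(params, cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    raise ValueError('unknown optimizer: ' + cfg.optimizer)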
bert_net_cfg = BertConfig(
    batch_size=bs,
    seq_length=128,
    vocab_size=3191,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    input_mask_from_dataset=True,
    token_type_ids_from_dataset=True,
    dtype=mstype.float32,
    compute_type=mstype.float16,
)
# Load the data
class PoetryDataGenerator(object):
    def __init__(self, batch_size, poetry, tokenizer, length=128):
        self.data = poetry
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.length = length

    def __getitem__(self, index):
        # note: the manual's code reshuffles the whole corpus on every access
        np.random.shuffle(self.data)
        current_data = self.data[index]
        # encode one poem, then pad token and segment ids to a fixed length
        token_ids, segment_ids = self.tokenizer.encode(current_data)
        batch_token_ids = padding(token_ids, length=self.length)
        batch_segment_ids = padding(segment_ids, length=self.length)
        # mask out padding positions (token id 0)
        pad_mask = (batch_token_ids != 0).astype(np.float32)
        return (batch_token_ids, batch_segment_ids, pad_mask)

    def __len__(self):
        return len(self.data)
def create_poetry_dataset(batch_size, poetry, tokenizer):
    dt = PoetryDataGenerator(batch_size, poetry, tokenizer)
    ds = de.GeneratorDataset(dt, ["input_ids", "token_type_id", "pad_mask"])
    #ds.set_dataset_size(dt.__len__())
    # cast each column to the dtype the network expects
    int_type_cast_op = C.TypeCast(mstype.int32)
    float_type_cast_op = C.TypeCast(mstype.float32)
    ds = ds.map(input_columns="input_ids", operations=int_type_cast_op)
    ds = ds.map(input_columns="token_type_id", operations=int_type_cast_op)
    ds = ds.map(input_columns="pad_mask", operations=float_type_cast_op)
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
# Input: build the tokenizer and the dataset
poetry, tokenizer, keep_words = create_tokenizer(cfg)
num_tokens = len(keep_words)
dataset = create_poetry_dataset(bert_net_cfg.batch_size, poetry, tokenizer)
next(dataset.create_dict_iterator())
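The next(...) call above just pulls one batch to verify the pipeline runs. It is also worth confirming that the pruned vocabulary matches the vocab_size=3191 hardcoded in bert_net_cfg, since the embedding slicing in train() below depends on it (my own sanity check, assuming BertConfig exposes its fields as attributes):

print("num_tokens:", num_tokens)
# the pruned vocabulary must match the configured vocab size, otherwise
# the sliced pretrained embedding table will not fit the network
assert num_tokens == bert_net_cfg.vocab_size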
# Model training
def train():
    '''
    build and train bert model for poetry generation
    '''
    poetrymodel = BertPoetryModel(bert_net_cfg, True, num_tokens, dropout_prob=0.1)
    netwithloss = BertPoetry(poetrymodel, bert_net_cfg, True, dropout_prob=0.1)
    callback = LossCallBack(poetrymodel)
    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    print("============ steps_per_epoch is {}".format(steps_per_epoch))
    lr_schedule = BertLearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                   end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                   warmup_steps=1000,
                                   decay_steps=cfg.epoch_num*steps_per_epoch,
                                   power=cfg.AdamWeightDecay.power)
    optimizer = AdamWeightDecay(netwithloss.trainable_params(), lr_schedule)
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix, directory=cfg.ckpt_dir, config=ckpt_config)
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    new_dict = {}
    # load corresponding rows of embedding_lookup
    for key in param_dict:
        if "bert_embedding_lookup" not in key:
            new_dict[key] = param_dict[key]
        else:
            # slice the pretrained embedding table down to the kept tokens
            value = param_dict[key]
            np_value = value.data.asnumpy()
            np_value = np_value[keep_words]
            tensor_value = Tensor(np_value, mstype.float32)
            parameter_value = Parameter(tensor_value, name=key)
            new_dict[key] = parameter_value
    load_param_into_net(netwithloss, new_dict)
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertPoetryCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset, callbacks=[callback, ckpoint_cb], dataset_sink_mode=True)
# Start training
train()
# Testing and evaluation
def generate_head_poetry(model, head=""):
    '''generate an acrostic poem whose lines start with the characters of head'''
    # encode an empty string to get the special-token scaffold, then drop the trailing separator
    token_ids, segment_ids = tokenizer.encode('')
    token_ids = token_ids[:-1]
    segment_ids = segment_ids[:-1]
    punctuations = [',', '。']
    punctuation_ids = [tokenizer._token_to_id[token] for token in punctuations]
    poetry = []
    length = 128
    for ch in head:
        # seed each line with one character from the head
        poetry.append(ch)
        token_id = tokenizer._token_to_id[ch]
        token_ids.append(token_id)
        segment_ids.append(0)
        while True:
            index = len(token_ids)
            _target_ids = padding(np.array(token_ids), length=length)
            _segment_ids = padding(np.array(segment_ids), length=length)
            pad_mask = (_target_ids != 0).astype(np.float32)
            _target_ids = Tensor([_target_ids], mstype.int32)
            _segment_ids = Tensor([_segment_ids], mstype.int32)
            pad_mask = Tensor([pad_mask], mstype.float32)
            # predict the next-token distribution, skipping the 3 special tokens
            _probas = model(_target_ids, _segment_ids, pad_mask).asnumpy()
            _probas = _probas[0, index-1, 3:]
            # sample from the 100 most probable tokens (top-k sampling)
            p_args = _probas.argsort()[::-1][:100]
            p = _probas[p_args]
            p = p / sum(p)
            target_index = np.random.choice(len(p), p=p)
            target = p_args[target_index] + 3
            token_ids.append(target)
            segment_ids.append(0)
            if target > 3:
                poetry.append(tokenizer._id_to_token[target])
            if target in punctuation_ids:
                # a line ends at a comma or full stop
                break
    return ''.join(poetry)
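The inner sampling step above is top-k sampling with k=100: keep the 100 most probable next tokens, renormalize their probabilities, and draw one at random. Here is the same idea stripped of the model, in plain NumPy (illustrative only):

def top_k_sample(probas, k=100):
    # indices of the k highest-probability tokens, best first
    p_args = probas.argsort()[::-1][:k]
    p = probas[p_args]
    p = p / p.sum()  # renormalize over the kept tokens
    return p_args[np.random.choice(len(p), p=p)]

# e.g. a toy 6-token distribution, sampling among the top 3
print(top_k_sample(np.array([0.4, 0.25, 0.15, 0.1, 0.06, 0.04]), k=3))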
!ls data/checkpoint
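The checkpoint name follows the pattern poetry-{epoch}_{steps}.ckpt, so the exact filename depends on your dataset size; the poetry-2_1535.ckpt used below is what the training run above produced for me. To avoid hardcoding it, you could pick the newest file (my own snippet):

import glob, os
ckpts = sorted(glob.glob('./data/checkpoint/poetry*.ckpt'), key=os.path.getmtime)
print(ckpts)  # the last entry is the most recent checkpoint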
# Load and restore the offline model
# use batch size 1 for inference
bert_net_cfg.batch_size = 1
poetrymodel = BertPoetryModel(bert_net_cfg, False, 3191, dropout_prob=0.0)
poetrymodel.set_train(False)
ckpt_path = './data/checkpoint/poetry-2_1535.ckpt'  # adjust to the file listed above
param_dict = load_checkpoint(ckpt_path)
load_param_into_net(poetrymodel, param_dict)
# Input: generate acrostic poems from the given head characters
# (wrapped in print() so both results show up in the notebook, not just the last one)
print(generate_head_poetry(poetrymodel, "人工智能"))
print(generate_head_poetry(poetrymodel, "自然语言处理"))