Reference: Bahdanau et al. (2015)
Reference: Vaswani et al. (2017)
transformers
This part is adapted from the 🤗 course, which I very strongly encourage you to read in detail!
import torch
import transformers
We'll use DistilBERT (Sanh et al., 2019), a version of BERT compressed through distillation. It has the advantage of being considerably lighter (and therefore kinder to notebooks), while remaining almost as accurate.
bert_model = transformers.AutoModel.from_pretrained("distilbert-base-multilingual-cased")
Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight'] - This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
bert_model.config
DistilBertConfig { "_name_or_path": "distilbert-base-multilingual-cased", "activation": "gelu", "architectures": [ "DistilBertForMaskedLM" ], "attention_dropout": 0.1, "dim": 768, "dropout": 0.1, "hidden_dim": 3072, "initializer_range": 0.02, "max_position_embeddings": 512, "model_type": "distilbert", "n_heads": 12, "n_layers": 6, "output_past": true, "pad_token_id": 0, "qa_dropout": 0.1, "seq_classif_dropout": 0.2, "sinusoidal_pos_embds": false, "tie_weights_": true, "transformers_version": "4.21.1", "vocab_size": 119547 }
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
tokenizer.tokenize("Morgan reconnait l'existence du kiwi.")
['Morgan', 're', '##con', '##nait', 'l', "'", 'existence', 'du', 'ki', '##wi', '.']
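The ## prefix marks subword pieces that continue the previous token. As a quick sanity check (a sketch reusing the tokenizer defined above), we can round-trip through token ids and decode back to a string:
# Round-trip: encode to ids, then decode back to a string
ids = tokenizer("Morgan reconnait l'existence du kiwi.")["input_ids"]
tokenizer.decode(ids)
# -> roughly "[CLS] Morgan reconnait l'existence du kiwi. [SEP]"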
with torch.no_grad():
transformer_output = bert_model(**tokenizer("Morgan reconnait l'existence du kiwi.", return_tensors="pt"))
transformer_output.keys()
odict_keys(['last_hidden_state'])
final_embeddings = transformer_output.last_hidden_state
display(final_embeddings)
display(final_embeddings.shape)
tensor([[[ 0.0838, -0.0613, 0.2639, ..., -0.0054, 0.0605, -0.1246], [-0.0150, -0.2139, 0.8563, ..., -0.2401, 0.0096, -0.3151], [-0.3025, 0.0480, 0.2955, ..., -0.0666, 0.2111, -0.0942], ..., [-0.0193, -0.0999, 0.8865, ..., -0.2074, -0.1313, 0.0032], [ 0.1320, -0.1747, 0.6952, ..., -0.1712, 0.0451, -0.1170], [ 0.1944, -0.2817, 0.6412, ..., -0.0965, 0.1687, -0.0757]]])
torch.Size([1, 13, 768])
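Each of the 13 tokens (including [CLS] and [SEP]) gets a 768-dimensional contextual vector. To get a single sentence vector, two common options (a sketch, not something prescribed by the course) are to take the [CLS] position or to mean-pool over the tokens, using the attention mask to ignore padding:
inputs = tokenizer("Morgan reconnait l'existence du kiwi.", return_tensors="pt")

# Option 1: the vector at the [CLS] position (first token)
cls_vector = final_embeddings[:, 0, :]  # shape (1, 768)

# Option 2: mean pooling over non-padding tokens
mask = inputs["attention_mask"].unsqueeze(-1)  # shape (1, 13, 1)
mean_vector = (final_embeddings * mask).sum(dim=1) / mask.sum(dim=1)  # shape (1, 768)

display(cls_vector.shape, mean_vector.shape)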
with torch.no_grad():
transformer_output = bert_model(
**tokenizer("Morgan reconnait l'existence du kiwi.",return_tensors="pt"),
output_hidden_states=True
)
transformer_output.keys()
odict_keys(['last_hidden_state', 'hidden_states'])
all_embeddings = transformer_output.hidden_states
display(all_embeddings)
display([e.shape for e in all_embeddings])
(tensor([[[-1.6674, 0.2950, 0.0995, ..., 0.3734, -0.1197, -0.2644], [ 0.2892, 0.0066, 0.3684, ..., -0.8158, -0.6765, 0.2650], [-1.2575, -0.7843, 0.2450, ..., 0.4168, -0.1072, -1.5565], ..., [-0.4920, -0.2065, -0.8416, ..., -0.3397, 0.5642, 0.6602], [ 0.0333, 0.3280, 0.4192, ..., -0.5410, 0.1095, -0.1245], [ 0.5600, -0.3318, 0.2777, ..., -0.1388, 0.5796, -0.0112]]]), tensor([[[-0.1484, -0.0141, -0.0188, ..., 0.0253, -0.0633, 0.0555], [-0.0337, 0.3092, 0.0693, ..., -0.0691, -0.6481, 0.5697], [-0.7947, -0.1076, -0.5636, ..., 0.6161, -0.3455, -1.0741], ..., [-0.4187, 0.5363, -0.8758, ..., -0.1789, 0.2832, 0.5478], [-0.0585, 0.2529, 0.2344, ..., -0.4691, -0.2596, -0.0422], [ 0.5647, -0.0543, 0.0201, ..., 0.0291, 0.1009, 0.0432]]]), tensor([[[-0.0098, 0.0093, 0.0489, ..., -0.0630, -0.0303, 0.0923], [-0.3779, 0.0607, 0.5345, ..., 0.0588, -0.4289, 0.5428], [-0.1046, -0.2210, -0.5352, ..., 0.5416, -0.2393, -0.8234], ..., [-0.5761, 0.2216, -0.0228, ..., 0.3734, -0.1081, -0.1992], [-0.1978, 0.0790, 0.4234, ..., -0.4188, -0.5024, -0.5788], [ 0.1575, -0.3394, 0.1450, ..., -0.0465, -0.1249, -0.1153]]]), tensor([[[-0.2005, 0.0226, -0.0520, ..., -0.0932, 0.1626, 0.1144], [-0.6412, 0.0658, 0.6553, ..., -0.1208, -0.3212, 0.3506], [-0.6291, -0.1628, 0.0385, ..., 0.5959, -0.3689, -0.9464], ..., [-0.2684, -0.1826, -0.1068, ..., 0.3560, 0.0260, 0.3446], [-0.2054, -0.1081, 0.2567, ..., -0.5167, -0.2874, -0.2840], [-0.0055, 0.0084, 0.0424, ..., -0.0581, -0.0183, 0.0354]]]), tensor([[[-0.1210, -0.1969, 0.0277, ..., 0.0199, -0.1496, 0.3188], [ 0.0666, -0.0678, 0.6049, ..., -0.0261, -0.6018, 0.3004], [-0.0920, -0.0919, -0.2963, ..., 0.5922, -0.4955, -0.6195], ..., [ 0.0781, 0.2227, 0.2286, ..., 0.2290, -0.1557, -0.1891], [-0.0873, -0.3004, 0.3723, ..., -0.3070, -0.3035, -0.0974], [-0.0076, -0.0261, 0.0196, ..., -0.0852, -0.0589, 0.0855]]]), tensor([[[ 0.0030, -0.1408, -0.0333, ..., -0.2426, -0.1485, -0.0193], [ 0.0024, -0.1793, 0.5014, ..., -0.3036, -0.3411, -0.2827], [-0.2844, 0.1422, -0.2259, ..., 0.0542, -0.2185, -0.1863], ..., [-0.0204, -0.0262, 0.3143, ..., 0.1757, -0.2743, -0.2989], [ 0.0573, -0.1484, 0.3797, ..., -0.3647, -0.1508, -0.0013], [ 0.0568, -0.1357, 0.0571, ..., 0.0514, -0.0201, 0.1149]]]), tensor([[[ 0.0838, -0.0613, 0.2639, ..., -0.0054, 0.0605, -0.1246], [-0.0150, -0.2139, 0.8563, ..., -0.2401, 0.0096, -0.3151], [-0.3025, 0.0480, 0.2955, ..., -0.0666, 0.2111, -0.0942], ..., [-0.0193, -0.0999, 0.8865, ..., -0.2074, -0.1313, 0.0032], [ 0.1320, -0.1747, 0.6952, ..., -0.1712, 0.0451, -0.1170], [ 0.1944, -0.2817, 0.6412, ..., -0.0965, 0.1687, -0.0757]]]))
[torch.Size([1, 13, 768]), torch.Size([1, 13, 768]), torch.Size([1, 13, 768]), torch.Size([1, 13, 768]), torch.Size([1, 13, 768]), torch.Size([1, 13, 768]), torch.Size([1, 13, 768])]
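There are 7 tensors: the output of the embedding layer plus one per transformer layer (n_layers = 6). The last element is exactly the last_hidden_state we saw above:
# hidden_states = embedding output + one tensor per layer (6 for DistilBERT)
display(len(all_embeddings))  # 7
display(torch.equal(all_embeddings[-1], transformer_output.last_hidden_state))  # True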
# To avoid eating up all the RAM
del bert_model
del tokenizer
classifier = transformers.pipeline("sentiment-analysis")
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english). Using a pipeline without specifying a model name and revision in production is not recommended.
classifier("There are not many sentiment analysis models on 🤗 hub")
[{'label': 'NEGATIVE', 'score': 0.9971767663955688}]
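A pipeline also accepts a list of inputs and returns one prediction per input, for example:
classifier(
    [
        "I love how easy this library makes things",
        "This tutorial is way too long",
    ]
)
# -> a list with one {'label': ..., 'score': ...} dict per input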
lm = transformers.pipeline("fill-mask", model="distilbert-base-multilingual-cased")
lm(f"En France, c'est l'Université de {lm.tokenizer.mask_token} la meilleure.")
[{'score': 0.04588303342461586, 'token': 10688, 'token_str': 'France', 'sequence': "En France, c'est l'Université de France la meilleure."}, {'score': 0.021794594824314117, 'token': 20642, 'token_str': 'Chemistry', 'sequence': "En France, c'est l'Université de Chemistry la meilleure."}, {'score': 0.01765727624297142, 'token': 38069, 'token_str': 'Montpellier', 'sequence': "En France, c'est l'Université de Montpellier la meilleure."}, {'score': 0.015636147931218147, 'token': 32134, 'token_str': 'Nantes', 'sequence': "En France, c'est l'Université de Nantes la meilleure."}, {'score': 0.014034918509423733, 'token': 22384, 'token_str': 'langue', 'sequence': "En France, c'est l'Université de langue la meilleure."}]
For the others: see the list of pipelines in the documentation.
Beware: not all models have been fine-tuned for all pipelines; you can search the hub for models trained for a given task and language (for example, NER models).
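Once you have found a suitable model, you can pass it explicitly to the pipeline. A sketch (the model id below is just an example picked from the hub; check that it matches your task and language):
ner = transformers.pipeline(
    "ner",
    model="Jean-Baptiste/camembert-ner",  # example hub model id, replace with your own pick
    aggregation_strategy="simple",
)
ner("Morgan travaille à Orléans.")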
And if there isn't one? We can train one!
We'll use one of the tasks from the GLUE multi-task benchmark, in its 🤗 datasets version.
import datasets
There are a lot of subtasks in GLUE (that's the whole point); we'll start with the one that is probably most familiar to us: polarity detection (sentiment analysis), with the Stanford Sentiment Treebank version 2 corpus, sst2.
raw_datasets = datasets.load_dataset("glue", "sst2")
raw_datasets
Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /home/runner/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...
Dataset glue downloaded and prepared to /home/runner/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.
DatasetDict({ train: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 67349 }) validation: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 872 }) test: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 1821 }) })
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[6]
{'sentence': 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ', 'label': 1, 'idx': 6}
raw_train_dataset.features
{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['negative', 'positive'], id=None), 'idx': Value(dtype='int32', id=None)}
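The label column is a ClassLabel, so we can map the integer labels back to their names:
label_feature = raw_train_dataset.features["label"]
label_feature.int2str(raw_train_dataset[6]["label"])
# -> 'positive'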
raw_train_dataset[16:32]
{'sentence': ['cold movie ', 'with his usual intelligence and subtlety ', 'redundant concept ', "swimming is above all about a young woman 's face , and by casting an actress whose face projects that woman 's doubts and yearnings , it succeeds . ", 'equals the original and in some ways even betters it ', 'if anything , see it for karen black , who camps up a storm as a fringe feminist conspiracy theorist named dirty dick . ', 'a smile on your face ', 'comes from the brave , uninhibited performances ', 'excruciatingly unfunny and pitifully unromantic ', 'enriched by an imaginatively mixed cast of antic spirits ', "which half of dragonfly is worse : the part where nothing 's happening , or the part where something 's happening ", 'in world cinema ', 'very good viewing alternative ', 'the plot is nothing but boilerplate clichés from start to finish , ', 'the action is stilted ', 'on all cylinders '], 'label': [0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1], 'idx': [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]}
raw_train_dataset["sentence"][9]
"are more deeply thought through than in most ` right-thinking ' films "
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
tokenized_sent = tokenizer(raw_datasets["train"]["sentence"][9])
tokenized_sent
{'input_ids': [101, 10301, 10798, 78846, 18957, 11222, 11084, 10106, 10992, 100, 13448, 118, 56294, 112, 14280, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tokenizer.convert_ids_to_tokens(tokenized_sent["input_ids"])
['[CLS]', 'are', 'more', 'deeply', 'thought', 'through', 'than', 'in', 'most', '[UNK]', 'right', '-', 'thinking', "'", 'films', '[SEP]']
tokenizer(raw_datasets["train"]["sentence"])[:8]
[Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=23, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=15, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=34, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]
tokenized_dataset = tokenizer(
raw_datasets["train"]["sentence"],
padding=True,
truncation=True,
)
tokenized_dataset[0]
Encoding(num_tokens=74, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
tokenized_dataset[0].tokens
['[CLS]', 'hide', 'new', 'secret', '##ions', 'from', 'the', 'parent', '##al', 'units', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
del tokenized_dataset
def tokenize_function(example):
return tokenizer(example["sentence"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets
DatasetDict({ train: Dataset({ features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'], num_rows: 67349 }) validation: Dataset({ features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'], num_rows: 872 }) test: Dataset({ features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'], num_rows: 1821 }) })
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence"]}
[len(x) for x in samples["input_ids"]]
[11, 11, 16, 13, 23, 15, 34, 5]
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}
{'input_ids': torch.Size([8, 34]), 'attention_mask': torch.Size([8, 34]), 'labels': torch.Size([8])}
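Outside of the Trainer, the same collator can be plugged into a plain PyTorch DataLoader, which is roughly what the Trainer does internally. A minimal sketch (the column handling is an assumption: keep only what the model expects):
from torch.utils.data import DataLoader

train_ds = tokenized_datasets["train"].remove_columns(["sentence", "idx"])
train_dataloader = DataLoader(
    train_ds,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,  # pads each batch to its own longest sequence
)
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}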
classifier = transformers.AutoModelForSequenceClassification.from_pretrained(
"distilbert-base-multilingual-cased", num_labels=2
)
classifier
Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight'] - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
DistilBertForSequenceClassification( (distilbert): DistilBertModel( (embeddings): Embeddings( (word_embeddings): Embedding(119547, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (transformer): Transformer( (layer): ModuleList( (0): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (1): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (2): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (3): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (4): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) 
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (5): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) ) ) ) (pre_classifier): Linear(in_features=768, out_features=768, bias=True) (classifier): Linear(in_features=768, out_features=2, bias=True) (dropout): Dropout(p=0.2, inplace=False) )
sentiment_pipeline = transformers.pipeline("sentiment-analysis", model=classifier, tokenizer=tokenizer)
sentiment_pipeline("This movie is not so bad")
[{'label': 'LABEL_0', 'score': 0.518805980682373}]
training_args = transformers.TrainingArguments(
gradient_accumulation_steps=2,
logging_steps=8,
max_steps=64,
output_dir="local/distilbert-base-multilingual-cased+sst2",
per_device_train_batch_size=4,
report_to="none",
warmup_ratio=1/16,
)
trainer = transformers.Trainer(
classifier,
training_args,
train_dataset=tokenized_datasets["train"],
data_collator=data_collator,
tokenizer=tokenizer,
)
max_steps is given, it will override any value given in num_train_epochs
trainer.train()
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message. /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn( ***** Running training ***** Num examples = 67349 Num Epochs = 1 Instantaneous batch size per device = 4 Total train batch size (w. parallel, distributed & accumulation) = 8 Gradient Accumulation steps = 2 Total optimization steps = 64
Step | Training Loss |
---|---|
8 | 0.687000 |
16 | 0.667000 |
24 | 0.709600 |
32 | 0.673300 |
40 | 0.660800 |
48 | 0.610300 |
56 | 0.589100 |
64 | 0.533800 |
Training completed. Do not forget to share your model on huggingface.co/models =)
TrainOutput(global_step=64, training_loss=0.6413654834032059, metrics={'train_runtime': 117.8238, 'train_samples_per_second': 4.345, 'train_steps_per_second': 0.543, 'total_flos': 3490722934896.0, 'train_loss': 0.6413654834032059, 'epoch': 0.01})
The model parameters have been modified in place, and we can use the model right away.
Remember to put the model in evaluation mode, otherwise the results will be partly random.
classifier.eval()
DistilBertForSequenceClassification( (distilbert): DistilBertModel( (embeddings): Embeddings( (word_embeddings): Embedding(119547, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (transformer): Transformer( (layer): ModuleList( (0): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (1): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (2): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (3): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (4): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) 
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (5): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) (activation): GELUActivation() ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) ) ) ) (pre_classifier): Linear(in_features=768, out_features=768, bias=True) (classifier): Linear(in_features=768, out_features=2, bias=True) (dropout): Dropout(p=0.2, inplace=False) )
We can use it to make predictions directly.
with torch.no_grad():
classifier_output = classifier(**tokenizer("This movie is not so bad", return_tensors="pt"))
display(classifier_output)
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0331, -0.0069]]), hidden_states=None, attentions=None)
We can also use it in the pipeline.
sentiment_pipeline("This movie is not so bad")
[{'label': 'LABEL_0', 'score': 0.5099858641624451}]
The score from the pipeline and the corresponding logit obtained by applying the model directly are different: that's because the pipeline applies a softmax:
classifier_output.logits.softmax(dim=-1)
tensor([[0.5100, 0.4900]])
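To turn the logits into a readable label, we can take the argmax and map it through the model config (for a model fine-tuned like this one, these are the generic LABEL_0/LABEL_1 names):
pred_id = classifier_output.logits.argmax(dim=-1).item()
classifier.config.id2label[pred_id]
# -> e.g. 'LABEL_0'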
If we want to use the fine-tuned model elsewhere, we have to save it.
classifier.save_pretrained("local/distilbert-base-multilingual-cased+sst2/model")
Configuration saved in local/distilbert-base-multilingual-cased+sst2/model/config.json Model weights saved in local/distilbert-base-multilingual-cased+sst2/model/pytorch_model.bin
It can then be loaded with the appropriate .from_pretrained("local/distilbert-base-multilingual-cased+sst2/model"), for example.
classifier = transformers.AutoModelForSequenceClassification.from_pretrained("local/distilbert-base-multilingual-cased+sst2/model")
loading configuration file local/distilbert-base-multilingual-cased+sst2/model/config.json Model config DistilBertConfig { "_name_or_path": "local/distilbert-base-multilingual-cased+sst2/model", "activation": "gelu", "architectures": [ "DistilBertForSequenceClassification" ], "attention_dropout": 0.1, "dim": 768, "dropout": 0.1, "hidden_dim": 3072, "initializer_range": 0.02, "max_position_embeddings": 512, "model_type": "distilbert", "n_heads": 12, "n_layers": 6, "output_past": true, "pad_token_id": 0, "problem_type": "single_label_classification", "qa_dropout": 0.1, "seq_classif_dropout": 0.2, "sinusoidal_pos_embds": false, "tie_weights_": true, "torch_dtype": "float32", "transformers_version": "4.21.1", "vocab_size": 119547 } loading weights file local/distilbert-base-multilingual-cased+sst2/model/pytorch_model.bin All model checkpoint weights were used when initializing DistilBertForSequenceClassification. All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at local/distilbert-base-multilingual-cased+sst2/model. If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.
It is good practice to save the tokenizer along with it.
tokenizer.save_pretrained("local/distilbert-base-multilingual-cased+sst2/model")
tokenizer config file saved in local/distilbert-base-multilingual-cased+sst2/model/tokenizer_config.json Special tokens file saved in local/distilbert-base-multilingual-cased+sst2/model/special_tokens_map.json
('local/distilbert-base-multilingual-cased+sst2/model/tokenizer_config.json', 'local/distilbert-base-multilingual-cased+sst2/model/special_tokens_map.json', 'local/distilbert-base-multilingual-cased+sst2/model/vocab.txt', 'local/distilbert-base-multilingual-cased+sst2/model/added_tokens.json', 'local/distilbert-base-multilingual-cased+sst2/model/tokenizer.json')
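Since the model and the tokenizer are now saved in the same directory, the whole thing can be reloaded in one step, for instance by pointing a pipeline at that directory:
reloaded_pipeline = transformers.pipeline(
    "sentiment-analysis",
    model="local/distilbert-base-multilingual-cased+sst2/model",
)
reloaded_pipeline("This movie is not so bad")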
predictions = trainer.predict(tokenized_datasets["validation"])
print(type(predictions.predictions), predictions.predictions.shape, predictions.label_ids.shape)
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message. ***** Running Prediction ***** Num examples = 872 Batch size = 8
<class 'numpy.ndarray'> (872, 2) (872,)
predicted_labels = predictions.predictions.argmax(axis=-1)
predicted_labels
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1])
predicted_labels == predictions.label_ids
array([ True, False, True, True, False, True, False, False, True, True, True, False, False, True, False, True, True, True, False, False, False, True, True, True, True, False, True, True, False, False, True, False, True, False, True, False, True, False, True, True, True, True, True, True, False, False, False, True, True, False, False, True, True, True, False, True, False, False, False, True, True, False, True, True, True, True, False, True, True, False, False, True, True, True, False, True, True, True, True, False, True, True, False, False, True, True, True, True, True, True, True, True, True, False, True, False, False, False, False, False, True, False, True, True, True, False, True, True, True, True, False, True, True, False, True, False, True, True, False, False, True, False, True, False, True, True, True, False, True, True, False, True, True, False, False, False, False, False, True, False, False, False, True, True, True, False, True, False, False, True, True, True, True, True, True, False, True, False, False, True, False, True, True, True, True, True, False, True, True, True, True, True, False, False, True, True, False, False, False, True, False, True, False, False, False, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, True, False, True, False, True, True, True, True, True, True, False, True, False, True, False, False, True, True, False, True, True, True, True, True, False, True, True, True, True, True, True, True, True, False, True, False, True, False, False, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, False, True, False, True, True, True, True, True, False, True, True, False, False, False, False, False, True, True, False, False, False, True, True, True, False, True, True, False, False, True, False, False, True, True, True, True, True, False, False, True, True, True, False, True, True, True, True, False, True, True, True, True, True, True, True, False, False, True, True, True, True, False, True, True, False, True, True, True, False, True, True, True, True, True, False, False, False, True, True, True, True, True, True, False, False, True, False, True, True, True, True, True, False, False, False, True, False, True, True, True, True, False, True, False, False, True, True, False, True, False, True, False, True, True, False, True, True, True, True, True, True, True, False, True, False, False, False, True, False, True, True, True, True, True, True, False, True, False, False, True, False, False, True, True, True, True, True, True, False, False, True, True, True, True, False, False, True, True, False, True, True, True, True, True, True, True, False, True, True, False, True, True, True, True, True, False, True, False, False, True, False, True, False, True, True, True, True, False, False, True, False, True, False, True, True, False, True, True, False, True, False, True, False, True, True, True, True, True, False, False, True, True, True, True, True, False, True, False, True, True, False, True, False, False, False, True, False, True, True, False, True, True, True, True, True, True, False, False, True, True, False, True, True, False, True, True, False, True, False, False, True, True, True, True, True, True, True, True, False, False, True, True, False, True, True, False, True, True, False, True, True, True, True, False, True, True, True, True, True, True, True, True, False, True, False, True, False, True, False, False, False, True, True, False, False, True, False, False, False, True, False, False, True, True, 
True, True, True, True, True, True, True, True, False, True, True, True, True, False, True, True, False, False, False, True, False, False, True, False, False, False, False, False, False, True, True, False, False, True, True, True, True, True, False, False, False, False, False, False, True, True, False, True, False, True, True, True, True, True, False, True, True, False, True, False, True, False, False, True, True, True, True, True, True, True, True, False, True, False, False, True, True, True, True, False, True, False, True, True, True, False, True, True, True, False, True, False, False, False, False, False, False, True, True, False, True, True, True, False, True, True, True, False, True, True, True, True, True, False, False, False, True, True, False, True, False, False, True, False, False, True, True, False, True, False, True, False, False, True, False, False, False, True, True, True, True, True, False, True, False, False, False, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, True, False, True, True, False, True, False, False, True, True, False, False, False, True, False, False, True, True, True, True, False, True, False, True, False, True, True, False, True, False, False, False, True, True, True, True, False, True, False, True, False, True, True, False, False, True, True, True, True, True, False, True, True, False, False, True, True, True, False, True, True, False, True, True, False, False, False, True, True, False, False, True, True, True, False, True, False, False, True, False, False, True, True, False, False, True, True, True, True, True, False, True, False, False, True, True, True, False, False, True, True, True, True, False, True, True, True, True, False, True, True, False, False, False, True, False, True, True, False, False, True, False, False, True, False, False, True, True, True, True, False, False, True, False, True, True, True, True, True, True, True, False, False, False, False, False, False, True, False, True, True])
(predicted_labels == predictions.label_ids).sum()/predicted_labels.shape[0]
0.6227064220183486
metric = datasets.load_metric("glue", "sst2")
metric.compute(predictions=predicted_labels, references=predictions.label_ids)
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) Input In [50], in <cell line: 1>() ----> 1 metric = datasets.load_metric("glue", "sst2") 2 metric.compute(predictions=predicted_labels, references=predictions.label_ids) File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:1389, in load_metric(path, config_name, process_id, num_process, cache_dir, experiment_id, keep_in_memory, download_config, download_mode, revision, **metric_init_kwargs) 1353 """Load a `datasets.Metric`. 1354 1355 Args: (...) 1386 ``` 1387 """ 1388 download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) -> 1389 metric_module = metric_module_factory( 1390 path, revision=revision, download_config=download_config, download_mode=download_mode 1391 ).module_path 1392 metric_cls = import_main_class(metric_module, dataset=False) 1393 metric = metric_cls( 1394 config_name=config_name, 1395 process_id=process_id, (...) 1400 **metric_init_kwargs, 1401 ) File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:1331, in metric_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, **download_kwargs) 1329 except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist. 1330 if not isinstance(e1, FileNotFoundError): -> 1331 raise e1 from None 1332 raise FileNotFoundError( 1333 f"Couldn't find a metric script at {relative_to_absolute_path(combined_path)}. " 1334 f"Metric '{path}' doesn't exist on the Hugging Face Hub either." 1335 ) from None 1336 else: File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:1319, in metric_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, **download_kwargs) 1317 elif is_relative_path(path) and path.count("/") == 0: 1318 try: -> 1319 return GithubMetricModuleFactory( 1320 path, 1321 revision=revision, 1322 download_config=download_config, 1323 download_mode=download_mode, 1324 dynamic_modules_path=dynamic_modules_path, 1325 ).get_module() 1326 except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached. 1327 try: File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:557, in GithubMetricModuleFactory.get_module(self) 552 logger.warning( 553 f"Couldn't find a directory or a metric named '{self.name}' in this version. " 554 f"It was picked from the main branch on github instead." 
555 ) 556 imports = get_imports(local_path) --> 557 local_imports = _download_additional_modules( 558 name=self.name, 559 base_path=hf_github_url(path=self.name, name="", revision=revision, dataset=False), 560 imports=imports, 561 download_config=self.download_config, 562 ) 563 # copy the script and the files in an importable directory 564 dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules() File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:215, in _download_additional_modules(name, base_path, imports, download_config) 213 _depencencies_str = "dependencies" if len(needs_to_be_installed) > 1 else "dependency" 214 _them_str = "them" if len(needs_to_be_installed) > 1 else "it" --> 215 raise ImportError( 216 f"To be able to use {name}, you need to install the following {_depencencies_str}: " 217 f"{', '.join(needs_to_be_installed)}.\nPlease install {_them_str} using 'pip install " 218 f"{' '.join(needs_to_be_installed.values())}' for instance'" 219 ) 220 return local_imports ImportError: To be able to use glue, you need to install the following dependency: sklearn. Please install it using 'pip install sklearn' for instance'
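Here the environment is missing scikit-learn, which the GLUE metric script depends on. For sst2 the metric is plain accuracy, so as a fallback we can compute it directly with numpy (this reproduces the manual computation above):
import numpy as np

def sst2_accuracy(predictions, references):
    predictions = np.asarray(predictions)
    references = np.asarray(references)
    return {"accuracy": float((predictions == references).mean())}

sst2_accuracy(predictions=predicted_labels, references=predictions.label_ids)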
def compute_metrics(eval_preds):
metric = datasets.load_metric("glue", "sst2")
logits, labels = eval_preds
predictions = logits.argmax(axis=-1)
return metric.compute(predictions=predictions, references=labels)
# Recreate the classifier to restart training from scratch
classifier = transformers.AutoModelForSequenceClassification.from_pretrained(
"distilbert-base-multilingual-cased", num_labels=2
)
training_args = transformers.TrainingArguments(
evaluation_strategy="steps",
eval_steps=16,
gradient_accumulation_steps=2,
logging_steps=8,
max_steps=64,
output_dir="local/distilbert-base-mutlitlingual-cased+sst2",
per_device_train_batch_size=4,
report_to="tensorboard",
warmup_ratio=1/16,
)
trainer = transformers.Trainer(
args=training_args,
compute_metrics=compute_metrics,
data_collator=data_collator,
eval_dataset=tokenized_datasets["validation"],
model=classifier,
tokenizer=tokenizer,
train_dataset=tokenized_datasets["train"],
)
trainer.train()
loading configuration file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/config.json from cache at /home/runner/.cache/huggingface/transformers/cf37a9dc282a679f121734d06f003625d14cfdaf55c14358c4c0b8e7e2b89ac9.7a727bd85e40715bec919a39cdd6f0aba27a8cd488f2d4e0f512448dcd02bf0f Model config DistilBertConfig { "_name_or_path": "distilbert-base-multilingual-cased", "activation": "gelu", "architectures": [ "DistilBertForMaskedLM" ], "attention_dropout": 0.1, "dim": 768, "dropout": 0.1, "hidden_dim": 3072, "initializer_range": 0.02, "max_position_embeddings": 512, "model_type": "distilbert", "n_heads": 12, "n_layers": 6, "output_past": true, "pad_token_id": 0, "qa_dropout": 0.1, "seq_classif_dropout": 0.2, "sinusoidal_pos_embds": false, "tie_weights_": true, "transformers_version": "4.21.1", "vocab_size": 119547 } loading weights file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/pytorch_model.bin from cache at /home/runner/.cache/huggingface/transformers/7b48683e2e7ba71cd1d7d6551ac325eceee01db5c2f3e81cfbfd1ee7bb7877f2.c24097b0cf91dbc66977325325fd03112f0f13d0e3579abbffc8d1e45f8d0619 Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight'] - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. PyTorch: setting up devices max_steps is given, it will override any value given in num_train_epochs The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message. ***** Running training ***** Num examples = 67349 Num Epochs = 1 Instantaneous batch size per device = 4 Total train batch size (w. parallel, distributed & accumulation) = 8 Gradient Accumulation steps = 2 Total optimization steps = 64
Step | Training Loss | Validation Loss |
---|---|---|
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message. ***** Running Evaluation ***** Num examples = 872 Batch size = 8
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) Input In [52], in <cell line: 30>() 7 training_args = transformers.TrainingArguments( 8 evaluation_strategy="steps", 9 eval_steps=16, (...) 16 warmup_ratio=1/16, 17 ) 19 trainer = transformers.Trainer( 20 args=training_args, 21 compute_metrics=compute_metrics, (...) 27 28 ) ---> 30 trainer.train() File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/transformers/trainer.py:1498, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs) 1493 self.model_wrapped = self.model 1495 inner_training_loop = find_executable_batch_size( 1496 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size 1497 ) -> 1498 return inner_training_loop( 1499 args=args, 1500 resume_from_checkpoint=resume_from_checkpoint, 1501 trial=trial, 1502 ignore_keys_for_eval=ignore_keys_for_eval, 1503 ) File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/transformers/trainer.py:1817, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval) 1814 self.state.epoch = epoch + (step + 1) / steps_in_epoch 1815 self.control = self.callback_handler.on_step_end(args, self.state, self.control) -> 1817 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) 1818 else: 1819 self.control = self.callback_handler.on_substep_end(args, self.state, self.control) File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/transformers/trainer.py:2038, in Trainer._maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval) 2036 metrics = None 2037 if self.control.should_evaluate: -> 2038 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 2039 self._report_to_hp_search(trial, self.state.global_step, metrics) 2041 if self.control.should_save: File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/transformers/trainer.py:2758, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix) 2755 start_time = time.time() 2757 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop -> 2758 output = eval_loop( 2759 eval_dataloader, 2760 description="Evaluation", 2761 # No point gathering the predictions if there are no metrics, otherwise we defer to 2762 # self.args.prediction_loss_only 2763 prediction_loss_only=True if self.compute_metrics is None else None, 2764 ignore_keys=ignore_keys, 2765 metric_key_prefix=metric_key_prefix, 2766 ) 2768 total_batch_size = self.args.eval_batch_size * self.args.world_size 2769 output.metrics.update( 2770 speed_metrics( 2771 metric_key_prefix, (...) 
2775 ) 2776 ) File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/transformers/trainer.py:3041, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix) 3037 metrics = self.compute_metrics( 3038 EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) 3039 ) 3040 else: -> 3041 metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) 3042 else: 3043 metrics = {} Input In [51], in compute_metrics(eval_preds) 1 def compute_metrics(eval_preds): ----> 2 metric = datasets.load_metric("glue", "sst2") 3 logits, labels = eval_preds 4 predictions = logits.argmax(axis=-1) File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:1389, in load_metric(path, config_name, process_id, num_process, cache_dir, experiment_id, keep_in_memory, download_config, download_mode, revision, **metric_init_kwargs) 1353 """Load a `datasets.Metric`. 1354 1355 Args: (...) 1386 ``` 1387 """ 1388 download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) -> 1389 metric_module = metric_module_factory( 1390 path, revision=revision, download_config=download_config, download_mode=download_mode 1391 ).module_path 1392 metric_cls = import_main_class(metric_module, dataset=False) 1393 metric = metric_cls( 1394 config_name=config_name, 1395 process_id=process_id, (...) 1400 **metric_init_kwargs, 1401 ) File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:1331, in metric_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, **download_kwargs) 1329 except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist. 1330 if not isinstance(e1, FileNotFoundError): -> 1331 raise e1 from None 1332 raise FileNotFoundError( 1333 f"Couldn't find a metric script at {relative_to_absolute_path(combined_path)}. " 1334 f"Metric '{path}' doesn't exist on the Hugging Face Hub either." 1335 ) from None 1336 else: File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:1319, in metric_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, **download_kwargs) 1317 elif is_relative_path(path) and path.count("/") == 0: 1318 try: -> 1319 return GithubMetricModuleFactory( 1320 path, 1321 revision=revision, 1322 download_config=download_config, 1323 download_mode=download_mode, 1324 dynamic_modules_path=dynamic_modules_path, 1325 ).get_module() 1326 except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached. 1327 try: File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:557, in GithubMetricModuleFactory.get_module(self) 552 logger.warning( 553 f"Couldn't find a directory or a metric named '{self.name}' in this version. " 554 f"It was picked from the main branch on github instead." 
555 ) 556 imports = get_imports(local_path) --> 557 local_imports = _download_additional_modules( 558 name=self.name, 559 base_path=hf_github_url(path=self.name, name="", revision=revision, dataset=False), 560 imports=imports, 561 download_config=self.download_config, 562 ) 563 # copy the script and the files in an importable directory 564 dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules() File /opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/datasets/load.py:215, in _download_additional_modules(name, base_path, imports, download_config) 213 _depencencies_str = "dependencies" if len(needs_to_be_installed) > 1 else "dependency" 214 _them_str = "them" if len(needs_to_be_installed) > 1 else "it" --> 215 raise ImportError( 216 f"To be able to use {name}, you need to install the following {_depencencies_str}: " 217 f"{', '.join(needs_to_be_installed)}.\nPlease install {_them_str} using 'pip install " 218 f"{' '.join(needs_to_be_installed.values())}' for instance'" 219 ) 220 return local_imports ImportError: To be able to use glue, you need to install the following dependency: sklearn. Please install it using 'pip install sklearn' for instance'
You can follow the progress of training in TensorBoard:
tensorboard serve --logdir slides/06-transformers/local/distilbert-base-multilingual-cased+sst2
Fine-tune a model (for a reasonable number of steps), French or multilingual, on the polarity detection task of the FLUE benchmark (Le et al., 2020).
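A sketch for getting started on the exercise (the dataset id, configuration name and column names below are assumptions; check the FLUE page on the hub before relying on them):
flue_raw = datasets.load_dataset("flue", "CLS")  # dataset id and config name to confirm on the hub
display(flue_raw)
flue_tokenized = flue_raw.map(
    lambda ex: tokenizer(ex["text"], truncation=True),  # text column name to confirm
    batched=True,
)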
del classifier
del sentiment_pipeline
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
tokenizer
loading configuration file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/config.json from cache at /home/runner/.cache/huggingface/transformers/cf37a9dc282a679f121734d06f003625d14cfdaf55c14358c4c0b8e7e2b89ac9.7a727bd85e40715bec919a39cdd6f0aba27a8cd488f2d4e0f512448dcd02bf0f Model config DistilBertConfig { "_name_or_path": "distilbert-base-multilingual-cased", "activation": "gelu", "architectures": [ "DistilBertForMaskedLM" ], "attention_dropout": 0.1, "dim": 768, "dropout": 0.1, "hidden_dim": 3072, "initializer_range": 0.02, "max_position_embeddings": 512, "model_type": "distilbert", "n_heads": 12, "n_layers": 6, "output_past": true, "pad_token_id": 0, "qa_dropout": 0.1, "seq_classif_dropout": 0.2, "sinusoidal_pos_embds": false, "tie_weights_": true, "transformers_version": "4.21.1", "vocab_size": 119547 } loading file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt from cache at /home/runner/.cache/huggingface/transformers/28e5b750bf4f39cc620367720e105de1501cf36ec4ca7029eba82c1d2cc47caf.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29 loading file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json from cache at /home/runner/.cache/huggingface/transformers/5cbdf121f196be5f1016cb102b197b0c34009e1e658f513515f2eebef9f38093.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449d18dc24 loading file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None loading file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None loading file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /home/runner/.cache/huggingface/transformers/47087d99feeb3bc6184d7576ff089c52f7fbe3219fe48c6c4fa681e617753256.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f loading configuration file https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/config.json from cache at /home/runner/.cache/huggingface/transformers/cf37a9dc282a679f121734d06f003625d14cfdaf55c14358c4c0b8e7e2b89ac9.7a727bd85e40715bec919a39cdd6f0aba27a8cd488f2d4e0f512448dcd02bf0f Model config DistilBertConfig { "_name_or_path": "distilbert-base-multilingual-cased", "activation": "gelu", "architectures": [ "DistilBertForMaskedLM" ], "attention_dropout": 0.1, "dim": 768, "dropout": 0.1, "hidden_dim": 3072, "initializer_range": 0.02, "max_position_embeddings": 512, "model_type": "distilbert", "n_heads": 12, "n_layers": 6, "output_past": true, "pad_token_id": 0, "qa_dropout": 0.1, "seq_classif_dropout": 0.2, "sinusoidal_pos_embds": false, "tie_weights_": true, "transformers_version": "4.21.1", "vocab_size": 119547 }
PreTrainedTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_config = transformers.AutoConfig.from_pretrained("distilbert-base-multilingual-cased")
(transformers logging: distilbert-base-multilingual-cased configuration loaded from the local cache; same DistilBertConfig as above)
model = transformers.AutoModelForMaskedLM.from_config(model_config)
lm = transformers.pipeline("fill-mask", model=model, tokenizer=tokenizer)
lm(f"En France, c'est l'Université de {lm.tokenizer.mask_token} la meilleure.")
[{'score': 8.413863542955369e-05, 'token': 38068, 'token_str': '##ಯು', 'sequence': "En France, c'est l'Université deಯು la meilleure."}, {'score': 8.082830026978627e-05, 'token': 56443, 'token_str': '##tivu', 'sequence': "En France, c'est l'Université detivu la meilleure."}, {'score': 7.235651719383895e-05, 'token': 106744, 'token_str': 'veuve', 'sequence': "En France, c'est l'Université de veuve la meilleure."}, {'score': 6.998784374445677e-05, 'token': 64329, 'token_str': '##sluttet', 'sequence': "En France, c'est l'Université desluttet la meilleure."}, {'score': 6.898239371366799e-05, 'token': 51964, 'token_str': '##brir', 'sequence': "En France, c'est l'Université debrir la meilleure."}]
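These predictions are nonsense because `from_config` builds a model with freshly initialized, random weights: nothing has been learned yet. For comparison, a quick sketch with the pretrained checkpoint instead (output not shown here, and of course model-dependent):
# For comparison: the same pipeline, but with pretrained weights loaded from the Hub.
pretrained_lm = transformers.pipeline("fill-mask", model="distilbert-base-multilingual-cased")
pretrained_lm(f"En France, c'est l'Université de {pretrained_lm.tokenizer.mask_token} la meilleure.")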
!mkdir -p local
!wget "https://sharedocs.huma-num.fr/wl/?id=LLYeokePZiJytROQ41iI3fkss6lMmGwd&fmode=download" -O local/ESLO_raw.txt
--2022-08-21 16:12:03--  https://sharedocs.huma-num.fr/wl/?id=LLYeokePZiJytROQ41iI3fkss6lMmGwd&fmode=download Resolving sharedocs.huma-num.fr (sharedocs.huma-num.fr)... 134.158.33.141 Connecting to sharedocs.huma-num.fr (sharedocs.huma-num.fr)|134.158.33.141|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 24159541 (23M) [text/plain] Saving to: ‘local/ESLO_raw.txt’ 2022-08-21 16:12:28 (980 KB/s) - ‘local/ESLO_raw.txt’ saved [24159541/24159541]
!head local/ESLO_raw.txt
vous attaquez ? oui alors on on attaque et on attend monsieur NPERS on attend monsieur NPERS et j- je sais pas du tout si il va venir alors au cas où monsieur NPERS ne vienne pas euh on attaque sans lui hein attaquons attaquons attaquons attaquons euh ça s'appelle techniques euh je sais pas quoi modernes d'enseignement des langues vivantes audio-visuel et &audio-oral
raw_dataset = datasets.load_dataset("text", data_files=["local/ESLO_raw.txt"])
raw_dataset
Using custom data configuration default-c8f12ad55021ff66
Downloading and preparing dataset text/default to /home/runner/.cache/huggingface/datasets/text/default-c8f12ad55021ff66/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad...
Dataset text downloaded and prepared to /home/runner/.cache/huggingface/datasets/text/default-c8f12ad55021ff66/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad. Subsequent calls will reuse this data.
DatasetDict({ train: Dataset({ features: ['text'], num_rows: 802481 }) })
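For anything beyond this demo you would also want a held-out split to monitor the masked-language-modelling loss. `train_test_split` gives one cheaply; in this sketch the 10% ratio and the seed are arbitrary:
# Carve a small validation set out of the raw corpus (ratio and seed are arbitrary).
split_dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=0)
split_dataset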
def tokenize_function(examples):
result = tokenizer(examples["text"], truncation=True)
return result
tokenized_dataset = raw_dataset.map(
tokenize_function, batched=True
)
tokenized_dataset
DatasetDict({ train: Dataset({ features: ['text', 'input_ids', 'attention_mask'], num_rows: 802481 }) })
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
tokenized_dataset["train"][0]
{'text': 'vous attaquez ?', 'input_ids': [101, 24931, 31583, 10305, 136, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}
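To check what those ids encode, decoding them back is a useful sanity check: the [CLS] and [SEP] special tokens added by the tokenizer should show up around the utterance.
# Round-trip check: ids should decode to the original utterance framed by special tokens.
tokenizer.decode(tokenized_dataset["train"][0]["input_ids"])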
samples = [tokenized_dataset["train"][i] for i in range(8)]
for chunk in data_collator(samples)["input_ids"]:
print(f"\n'>>> {tokenizer.decode(chunk)}'")
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
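The collator fails because each sample still contains the raw `text` field, a string that cannot be turned into a tensor. The `Trainer` used below drops unused columns automatically, but when calling the collator by hand you have to remove that column yourself. A minimal sketch:
# Drop the string-valued "text" column before handing the samples to the collator.
samples = [
    {k: v for k, v in tokenized_dataset["train"][i].items() if k != "text"}
    for i in range(8)
]
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")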
training_args = transformers.TrainingArguments(
gradient_accumulation_steps=2,
learning_rate=2e-5,
logging_steps=2,
max_steps=16,
output_dir=f"local/distilbert-ESLO",
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
report_to="tensorboard",
warmup_ratio=1/8,
weight_decay=0.01,
)
trainer = transformers.Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
data_collator=data_collator,
)
trainer.train()
PyTorch: setting up devices max_steps is given, it will override any value given in num_train_epochs The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: text. If text are not expected by `DistilBertForMaskedLM.forward`, you can safely ignore this message. ***** Running training ***** Num examples = 802481 Num Epochs = 1 Instantaneous batch size per device = 4 Total train batch size (w. parallel, distributed & accumulation) = 8 Gradient Accumulation steps = 2 Total optimization steps = 16
Step | Training Loss |
---|---|
2 | 11.924400 |
4 | 11.826700 |
6 | 8.822500 |
8 | 11.639700 |
10 | 11.789100 |
12 | 11.481300 |
14 | 11.535800 |
16 | 11.418200 |
Training completed. Do not forget to share your model on huggingface.co/models =)
TrainOutput(global_step=16, training_loss=11.304720163345337, metrics={'train_runtime': 46.1362, 'train_samples_per_second': 2.774, 'train_steps_per_second': 0.347, 'total_flos': 546905123064.0, 'train_loss': 11.304720163345337, 'epoch': 0.0})
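To keep the (briefly) adapted weights around, you can save the model and tokenizer and rebuild a pipeline from the saved directory; a sketch reusing the output directory declared in the TrainingArguments above:
# Save the adapted model and its tokenizer, then reload them in a fill-mask pipeline.
trainer.save_model("local/distilbert-ESLO")
tokenizer.save_pretrained("local/distilbert-ESLO")
adapted_lm = transformers.pipeline("fill-mask", model="local/distilbert-ESLO")
adapted_lm(f"on attend monsieur {adapted_lm.tokenizer.mask_token}")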
# speech_tokenizer = tokenizer.train_new_from_iterator((ex["text"] for ex in raw_dataset["train"]), 8192)
tokenizer.tokenize("Je reconnais l'existence du kiwi")
['Je', 're', '##con', '##nais', 'l', "'", 'existence', 'du', 'ki', '##wi']
# speech_tokenizer.tokenize("Je reconnais l'existence du kiwi")
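If you do run the tokenizer training, a batched iterator over the `text` column is markedly faster than feeding examples one by one; a sketch keeping the same 8192-token vocabulary, left commented out like the cell above because it takes a while:
# Batched text iterator over the ESLO corpus, for faster tokenizer training.
def batch_iterator(batch_size=1000):
    for start in range(0, len(raw_dataset["train"]), batch_size):
        yield raw_dataset["train"][start : start + batch_size]["text"]
# speech_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), 8192)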
simpletransformers
unmasker = transformers.pipeline("fill-mask", model="bert-base-cased")
result = unmasker("This man works as a [MASK].")
print([r["token_str"] for r in result])
result = unmasker("This woman works as a [MASK].")
print([r["token_str"] for r in result])
result = unmasker("This person works as a [MASK].")
print([r["token_str"] for r in result])
(transformers logging: bert-base-cased configuration, weights, and tokenizer files downloaded from the Hub and cached; only the NSP head weights are discarded when initializing BertForMaskedLM, which is expected)
['lawyer', 'carpenter', 'doctor', 'waiter', 'mechanic'] ['nurse', 'waitress', 'teacher', 'maid', 'prostitute'] ['teacher', 'nurse', 'lawyer', 'doctor', 'waiter']
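The pipeline also returns a score for each candidate; printing it next to the token makes the comparison between prompts a bit more informative (a sketch reusing the same `unmasker`):
# Show each predicted token together with its probability, not just the token string.
for r in unmasker("This woman works as a [MASK]."):
    print(f"{r['token_str']:<12} {r['score']:.3f}")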
unmasker = transformers.pipeline("fill-mask", model="distilbert-base-multilingual-cased")
result = unmasker("This man works as a [MASK].")
print([r["token_str"] for r in result])
result = unmasker("This woman works as a [MASK].")
print([r["token_str"] for r in result])
result = unmasker("This person works as a [MASK].")
print([r["token_str"] for r in result])
(transformers logging: distilbert-base-multilingual-cased configuration, weights, and tokenizer files loaded from the local cache; all checkpoint weights are used when initializing DistilBertForMaskedLM)
['painter', 'teacher', 'man', 'child', 'priest'] ['teacher', 'painter', 'lawyer', '##½', 'child'] ['teacher', 'painter', 'priest', 'writer', 'poet']