From 72911c893ac26e201c9f88d534fc610b4639343a Mon Sep 17 00:00:00 2001
From: Cahya Wirawan
Date: Mon, 17 Aug 2020 09:42:25 +0200
Subject: [PATCH] Create model cards for indonesian models (#6522)

* added model cards for indonesian gpt2-small, bert-base and roberta-base models

* removed bibtex entries
---
 .../cahya/bert-base-indonesian-522M/README.md | 73 +++++++++++++++++++
 .../gpt2-small-indonesian-522M/README.md      | 64 ++++++++++++++++
 .../roberta-base-indonesian-522M/README.md    | 58 +++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 model_cards/cahya/bert-base-indonesian-522M/README.md
 create mode 100644 model_cards/cahya/gpt2-small-indonesian-522M/README.md
 create mode 100644 model_cards/cahya/roberta-base-indonesian-522M/README.md

diff --git a/model_cards/cahya/bert-base-indonesian-522M/README.md b/model_cards/cahya/bert-base-indonesian-522M/README.md
new file mode 100644
index 000000000..6008e770a
--- /dev/null
+++ b/model_cards/cahya/bert-base-indonesian-522M/README.md
@@ -0,0 +1,73 @@
+---
+language: "id"
+license: "mit"
+datasets:
+- Indonesian Wikipedia
+widget:
+- text: "Ibu ku sedang bekerja [MASK] supermarket."
+---
+
+# Indonesian BERT base model (uncased)
+
+## Model description
+This is a BERT base model pre-trained on Indonesian Wikipedia using a masked language modeling (MLM) objective. The
+model is uncased: it does not make a difference between indonesia and Indonesia.
+
+This is one of several language models pre-trained on Indonesian datasets. More details about
+their usage on downstream tasks (text classification, text generation, etc.) are available at [Transformer based Indonesian Language Models](https://github.com/cahya-wirawan/indonesian-language-models/tree/master/Transformers).
+
+## Intended uses & limitations
+
+### How to use
+You can use this model directly with a pipeline for masked language modeling:
+```python
+>>> from transformers import pipeline
+>>> unmasker = pipeline('fill-mask', model='cahya/bert-base-indonesian-522M')
+>>> unmasker("Ibu ku sedang bekerja [MASK] supermarket")
+
+[{'sequence': '[CLS] ibu ku sedang bekerja di supermarket [SEP]',
+  'score': 0.7983310222625732,
+  'token': 1495},
+ {'sequence': '[CLS] ibu ku sedang bekerja. supermarket [SEP]',
+  'score': 0.090003103017807,
+  'token': 17},
+ {'sequence': '[CLS] ibu ku sedang bekerja sebagai supermarket [SEP]',
+  'score': 0.025469014421105385,
+  'token': 1600},
+ {'sequence': '[CLS] ibu ku sedang bekerja dengan supermarket [SEP]',
+  'score': 0.017966199666261673,
+  'token': 1555},
+ {'sequence': '[CLS] ibu ku sedang bekerja untuk supermarket [SEP]',
+  'score': 0.016971781849861145,
+  'token': 1572}]
+```
+Here is how to use this model to get the features of a given text in PyTorch:
+```python
+from transformers import BertTokenizer, BertModel
+
+model_name = 'cahya/bert-base-indonesian-522M'
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = BertModel.from_pretrained(model_name)
+text = "Silakan diganti dengan text apa saja."
+encoded_input = tokenizer(text, return_tensors='pt')
+output = model(**encoded_input)
+```
+and in TensorFlow:
+```python
+from transformers import BertTokenizer, TFBertModel
+
+model_name = 'cahya/bert-base-indonesian-522M'
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = TFBertModel.from_pretrained(model_name)
+text = "Silakan diganti dengan text apa saja."
+encoded_input = tokenizer(text, return_tensors='tf')
+output = model(encoded_input)
+```
+
+## Training data
+
+This model was pre-trained on 522 MB of Indonesian Wikipedia.
+The texts are lowercased and tokenized using WordPiece with a vocabulary size of 32,000. The inputs of the model are
+then of the form:
+
+```[CLS] Sentence A [SEP] Sentence B [SEP]```
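The sentence-pair layout described in this card can be reproduced directly with the tokenizer. The following is a minimal sketch, assuming the `cahya/bert-base-indonesian-522M` checkpoint is reachable on the Hugging Face Hub; the two example sentences are placeholders, not text from the training data:

```python
from transformers import BertTokenizer

model_name = 'cahya/bert-base-indonesian-522M'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Hypothetical example sentences; any Indonesian text works here.
sentence_a = "Ibu sedang bekerja."
sentence_b = "Dia bekerja di supermarket."

# Passing a text pair makes the tokenizer emit the
# [CLS] Sentence A [SEP] Sentence B [SEP] layout described above.
encoded = tokenizer(sentence_a, sentence_b)
print(tokenizer.decode(encoded['input_ids']))
# roughly: [CLS] ibu sedang bekerja. [SEP] dia bekerja di supermarket. [SEP]
```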
diff --git a/model_cards/cahya/gpt2-small-indonesian-522M/README.md b/model_cards/cahya/gpt2-small-indonesian-522M/README.md
new file mode 100644
index 000000000..cfd991839
--- /dev/null
+++ b/model_cards/cahya/gpt2-small-indonesian-522M/README.md
@@ -0,0 +1,64 @@
+---
+language: "id"
+license: "mit"
+datasets:
+- Indonesian Wikipedia
+widget:
+- text: "Pulau Dewata sering dikunjungi"
+---
+
+# Indonesian GPT2 small model
+
+## Model description
+This is a GPT2-small model pre-trained on Indonesian Wikipedia using a causal language modeling (CLM) objective. The
+model is uncased: it does not make a difference between indonesia and Indonesia.
+
+This is one of several language models pre-trained on Indonesian datasets. More details about
+their usage on downstream tasks (text classification, text generation, etc.) are available at [Transformer based Indonesian Language Models](https://github.com/cahya-wirawan/indonesian-language-models/tree/master/Transformers).
+
+## Intended uses & limitations
+
+### How to use
+You can use this model directly with a pipeline for text generation. Since the generation relies on some randomness,
+we set a seed for reproducibility:
+```python
+>>> from transformers import pipeline, set_seed
+>>> generator = pipeline('text-generation', model='cahya/gpt2-small-indonesian-522M')
+>>> set_seed(42)
+>>> generator("Kerajaan Majapahit adalah", max_length=30, num_return_sequences=5, num_beams=10)
+
+[{'generated_text': 'Kerajaan Majapahit adalah sebuah kerajaan yang pernah berdiri di Jawa Timur pada abad ke-14 hingga abad ke-15. Kerajaan ini berdiri pada abad ke-14'},
+{'generated_text': 'Kerajaan Majapahit adalah sebuah kerajaan yang pernah berdiri di Jawa Timur pada abad ke-14 hingga abad ke-16. Kerajaan ini berdiri pada abad ke-14'},
+{'generated_text': 'Kerajaan Majapahit adalah sebuah kerajaan yang pernah berdiri di Jawa Timur pada abad ke-14 hingga abad ke-15. Kerajaan ini berdiri pada abad ke-15'},
+{'generated_text': 'Kerajaan Majapahit adalah sebuah kerajaan yang pernah berdiri di Jawa Timur pada abad ke-14 hingga abad ke-16. Kerajaan ini berdiri pada abad ke-15'},
+{'generated_text': 'Kerajaan Majapahit adalah sebuah kerajaan yang pernah berdiri di Jawa Timur pada abad ke-14 hingga abad ke-15. Kerajaan ini merupakan kelanjutan dari Kerajaan Majapahit yang'}]
+
+```
+Here is how to use this model to get the features of a given text in PyTorch:
+```python
+from transformers import GPT2Tokenizer, GPT2Model

+model_name = 'cahya/gpt2-small-indonesian-522M'
+tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+model = GPT2Model.from_pretrained(model_name)
+text = "Silakan diganti dengan text apa saja."
+encoded_input = tokenizer(text, return_tensors='pt')
+output = model(**encoded_input)
+```
+and in TensorFlow:
+```python
+from transformers import GPT2Tokenizer, TFGPT2Model
+
+model_name = 'cahya/gpt2-small-indonesian-522M'
+tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+model = TFGPT2Model.from_pretrained(model_name)
+text = "Silakan diganti dengan text apa saja."
+encoded_input = tokenizer(text, return_tensors='tf')
+output = model(encoded_input)
+```
+
+## Training data
+
+This model was pre-trained on 522 MB of Indonesian Wikipedia.
+The texts are tokenized using a byte-level version of Byte Pair Encoding (BPE) (for unicode characters) with
+a vocabulary size of 52,000. The inputs are sequences of 128 consecutive tokens.
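The 128-token training sequences mentioned above can be illustrated by tokenizing a longer text and slicing the token ids into fixed-size blocks. This is only a sketch of the preprocessing idea, not the actual training script: the block size of 128 comes from the card, while the corpus snippet and everything else is assumed:

```python
from transformers import GPT2Tokenizer

model_name = 'cahya/gpt2-small-indonesian-522M'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Hypothetical corpus snippet; during pre-training this would be Wikipedia text.
corpus = "Silakan diganti dengan text apa saja. " * 50

block_size = 128  # sequence length stated in the model card
ids = tokenizer(corpus)['input_ids']

# Split the token ids into consecutive, non-overlapping 128-token blocks.
blocks = [ids[i:i + block_size] for i in range(0, len(ids) - block_size + 1, block_size)]
print(len(blocks), len(blocks[0]))
```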
diff --git a/model_cards/cahya/roberta-base-indonesian-522M/README.md b/model_cards/cahya/roberta-base-indonesian-522M/README.md
new file mode 100644
index 000000000..84e2aa210
--- /dev/null
+++ b/model_cards/cahya/roberta-base-indonesian-522M/README.md
@@ -0,0 +1,58 @@
+---
+language: "id"
+license: "mit"
+datasets:
+- Indonesian Wikipedia
+widget:
+- text: "Ibu ku sedang bekerja <mask> supermarket."
+---
+
+# Indonesian RoBERTa base model (uncased)
+
+## Model description
+This is a RoBERTa-base model pre-trained on Indonesian Wikipedia using a masked language modeling (MLM) objective. The
+model is uncased: it does not make a difference between indonesia and Indonesia.
+
+This is one of several language models pre-trained on Indonesian datasets. More details about
+their usage on downstream tasks (text classification, text generation, etc.) are available at [Transformer based Indonesian Language Models](https://github.com/cahya-wirawan/indonesian-language-models/tree/master/Transformers).
+
+## Intended uses & limitations
+
+### How to use
+You can use this model directly with a pipeline for masked language modeling:
+```python
+>>> from transformers import pipeline
+>>> unmasker = pipeline('fill-mask', model='cahya/roberta-base-indonesian-522M')
+>>> unmasker("Ibu ku sedang bekerja <mask> supermarket")
+
+```
+Here is how to use this model to get the features of a given text in PyTorch:
+```python
+from transformers import RobertaTokenizer, RobertaModel
+
+model_name = 'cahya/roberta-base-indonesian-522M'
+tokenizer = RobertaTokenizer.from_pretrained(model_name)
+model = RobertaModel.from_pretrained(model_name)
+text = "Silakan diganti dengan text apa saja."
+encoded_input = tokenizer(text, return_tensors='pt')
+output = model(**encoded_input)
+```
+and in TensorFlow:
+```python
+from transformers import RobertaTokenizer, TFRobertaModel
+
+model_name = 'cahya/roberta-base-indonesian-522M'
+tokenizer = RobertaTokenizer.from_pretrained(model_name)
+model = TFRobertaModel.from_pretrained(model_name)
+text = "Silakan diganti dengan text apa saja."
+encoded_input = tokenizer(text, return_tensors='tf')
+output = model(encoded_input)
+```
+
+## Training data
+
+This model was pre-trained on 522 MB of Indonesian Wikipedia.
+The texts are lowercased and tokenized using WordPiece with a vocabulary size of 32,000. The inputs of the model are
+then of the form:
+
+```<s> Sentence A </s> Sentence B </s>```
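Note that RoBERTa checkpoints conventionally use `<mask>` rather than BERT's `[MASK]`. A small sketch that builds the masked input from `tokenizer.mask_token` avoids hard-coding the token; it assumes the `cahya/roberta-base-indonesian-522M` checkpoint above is available on the Hub:

```python
from transformers import pipeline

model_name = 'cahya/roberta-base-indonesian-522M'
unmasker = pipeline('fill-mask', model=model_name)

# Use the tokenizer's own mask token so the string matches the checkpoint's vocabulary.
mask = unmasker.tokenizer.mask_token
predictions = unmasker(f"Ibu ku sedang bekerja {mask} supermarket")

for p in predictions:
    # Each prediction carries the filled-in sequence and its score.
    print(p['sequence'], p['score'])
```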