{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 0, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 3, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 4, "content": "[MASK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": { "type": "Lowercase" }, "pre_tokenizer": { "type": "Whitespace" }, "post_processor": { "type": "TemplateProcessing", "single": [ { "Sequence": { "id": "A", "type_id": 0 } } ], "pair": [ { "Sequence": { "id": "A", "type_id": 0 } }, { "Sequence": { "id": "B", "type_id": 1 } } ], "special_tokens": {} }, "decoder": null, "model": { "type": "BPE", "dropout": null, "unk_token": "[UNK]", "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "ignore_merges": false, "vocab": { "[UNK]": 0, "[PAD]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4, "a": 5, "b": 6, "c": 7, "d": 8, "e": 9, "f": 10, "g": 11, "h": 12, "i": 13, "k": 14, "l": 15, "m": 16, "n": 17, "o": 18, "r": 19, "s": 20, "t": 21, "u": 22, "v": 23, "z": 24, "en": 25, "is": 26, "st": 27, "te": 28, "ar": 29, "ce": 30, "ch": 31, "ct": 32, "er": 33, "in": 34, "om": 35, "to": 36, "ing": 37, "ab": 38, "an": 39, "at": 40, "ate": 41, "ace": 42, "bu": 43, "cl": 44, "cr": 45, "cu": 46, "cab": 47, "ding": 48, "ear": 49, "fr": 50, "face": 51, "gg": 52, "gen": 53, "hu": 54, "his": 55, "il": 56, "iz": 57, "ite": 58, "ict": 59, "ken": 60, "ocab": 61, "re": 62, "rict": 63, "sen": 64, "scr": 65, "ten": 66, "this": 67, "ure": 68, "vocab": 69, "stom": 70, "strict": 71, "test": 72, "arch": 73, "cture": 74, "erate": 75, "token": 76, "and": 77, "atch": 78, "buil": 79, "clear": 80, "custom": 81, "from": 82, "gging": 83, "generate": 84, "hugging": 85, "izer": 86, "itecture": 87, "senten": 88, "scratch": 89, "architecture": 90, "tokenizer": 91, "building": 92, "sentence": 93 }, "merges": [ [ "e", "n" ], [ "i", "s" ], [ "s", "t" ], [ "t", "e" ], [ "a", "r" ], [ "c", "e" ], [ "c", "h" ], [ "c", "t" ], [ "e", "r" ], [ "i", "n" ], [ "o", "m" ], [ "t", "o" ], [ "in", "g" ], [ "a", "b" ], [ "a", "n" ], [ "a", "t" ], [ "a", "te" ], [ "a", "ce" ], [ "b", "u" ], [ "c", "l" ], [ "c", "r" ], [ "c", "u" ], [ "c", "ab" ], [ "d", "ing" ], [ "e", "ar" ], [ "f", "r" ], [ "f", "ace" ], [ "g", "g" ], [ "g", "en" ], [ "h", "u" ], [ "h", "is" ], [ "i", "l" ], [ "i", "z" ], [ "i", "te" ], [ "i", "ct" ], [ "k", "en" ], [ "o", "cab" ], [ "r", "e" ], [ "r", "ict" ], [ "s", "en" ], [ "s", "cr" ], [ "t", "en" ], [ "t", "his" ], [ "u", "re" ], [ "v", "ocab" ], [ "st", "om" ], [ "st", "rict" ], [ "te", "st" ], [ "ar", "ch" ], [ "ct", "ure" ], [ "er", "ate" ], [ "to", "ken" ], [ "an", "d" ], [ "at", "ch" ], [ "bu", "il" ], [ "cl", "ear" ], [ "cu", "stom" ], [ "fr", "om" ], [ "gg", "ing" ], [ "gen", "erate" ], [ "hu", "gging" ], [ "iz", "er" ], [ "ite", "cture" ], [ "sen", "ten" ], [ "scr", "atch" ], [ "arch", "itecture" ], [ "token", "izer" ], [ "buil", "ding" ], [ "senten", "ce" ] ] } }