firstTestModel / tokenizer.json
go76dof's picture
feat: upload custom trained BPE tokenizer from scratch
fcdbc8e verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Lowercase"
},
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {}
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[UNK]": 0,
"[PAD]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"a": 5,
"b": 6,
"c": 7,
"d": 8,
"e": 9,
"f": 10,
"g": 11,
"h": 12,
"i": 13,
"k": 14,
"l": 15,
"m": 16,
"n": 17,
"o": 18,
"r": 19,
"s": 20,
"t": 21,
"u": 22,
"v": 23,
"z": 24,
"en": 25,
"is": 26,
"st": 27,
"te": 28,
"ar": 29,
"ce": 30,
"ch": 31,
"ct": 32,
"er": 33,
"in": 34,
"om": 35,
"to": 36,
"ing": 37,
"ab": 38,
"an": 39,
"at": 40,
"ate": 41,
"ace": 42,
"bu": 43,
"cl": 44,
"cr": 45,
"cu": 46,
"cab": 47,
"ding": 48,
"ear": 49,
"fr": 50,
"face": 51,
"gg": 52,
"gen": 53,
"hu": 54,
"his": 55,
"il": 56,
"iz": 57,
"ite": 58,
"ict": 59,
"ken": 60,
"ocab": 61,
"re": 62,
"rict": 63,
"sen": 64,
"scr": 65,
"ten": 66,
"this": 67,
"ure": 68,
"vocab": 69,
"stom": 70,
"strict": 71,
"test": 72,
"arch": 73,
"cture": 74,
"erate": 75,
"token": 76,
"and": 77,
"atch": 78,
"buil": 79,
"clear": 80,
"custom": 81,
"from": 82,
"gging": 83,
"generate": 84,
"hugging": 85,
"izer": 86,
"itecture": 87,
"senten": 88,
"scratch": 89,
"architecture": 90,
"tokenizer": 91,
"building": 92,
"sentence": 93
},
"merges": [
[
"e",
"n"
],
[
"i",
"s"
],
[
"s",
"t"
],
[
"t",
"e"
],
[
"a",
"r"
],
[
"c",
"e"
],
[
"c",
"h"
],
[
"c",
"t"
],
[
"e",
"r"
],
[
"i",
"n"
],
[
"o",
"m"
],
[
"t",
"o"
],
[
"in",
"g"
],
[
"a",
"b"
],
[
"a",
"n"
],
[
"a",
"t"
],
[
"a",
"te"
],
[
"a",
"ce"
],
[
"b",
"u"
],
[
"c",
"l"
],
[
"c",
"r"
],
[
"c",
"u"
],
[
"c",
"ab"
],
[
"d",
"ing"
],
[
"e",
"ar"
],
[
"f",
"r"
],
[
"f",
"ace"
],
[
"g",
"g"
],
[
"g",
"en"
],
[
"h",
"u"
],
[
"h",
"is"
],
[
"i",
"l"
],
[
"i",
"z"
],
[
"i",
"te"
],
[
"i",
"ct"
],
[
"k",
"en"
],
[
"o",
"cab"
],
[
"r",
"e"
],
[
"r",
"ict"
],
[
"s",
"en"
],
[
"s",
"cr"
],
[
"t",
"en"
],
[
"t",
"his"
],
[
"u",
"re"
],
[
"v",
"ocab"
],
[
"st",
"om"
],
[
"st",
"rict"
],
[
"te",
"st"
],
[
"ar",
"ch"
],
[
"ct",
"ure"
],
[
"er",
"ate"
],
[
"to",
"ken"
],
[
"an",
"d"
],
[
"at",
"ch"
],
[
"bu",
"il"
],
[
"cl",
"ear"
],
[
"cu",
"stom"
],
[
"fr",
"om"
],
[
"gg",
"ing"
],
[
"gen",
"erate"
],
[
"hu",
"gging"
],
[
"iz",
"er"
],
[
"ite",
"cture"
],
[
"sen",
"ten"
],
[
"scr",
"atch"
],
[
"arch",
"itecture"
],
[
"token",
"izer"
],
[
"buil",
"ding"
],
[
"senten",
"ce"
]
]
}
}