{
  "_id": "69df4803c5259ad4cd8dfe02",
  "Package": "tokenizers",
  "Type": "Package",
  "Title": "Fast, Consistent Tokenization of Natural Language Text",
  "Version": "0.3.1",
  "Date": "2024-03-27",
  "Description": "Convert natural language text into tokens. Includes\ntokenizers for shingled n-grams, skip n-grams, words, word\nstems, sentences, paragraphs, characters, shingled characters,\nlines, Penn Treebank, regular expressions, as well as functions\nfor counting characters, words, and sentences, and a function\nfor splitting longer texts into separate documents, each with\nthe same number of words.  The tokenizers have a consistent\ninterface, and the package is built on the 'stringi' and 'Rcpp'\npackages for fast yet correct tokenization in 'UTF-8'.",
  "License": "MIT + file LICENSE",
  "LazyData": "yes",
  "Authors@R": "c(person(\"Thomas\", \"Charlon\", role = c(\"aut\", \"cre\"),\nemail = \"charlon@protonmail.com\",\ncomment = c(ORCID = \"0000-0001-7497-0470\")),\nperson(\"Lincoln\", \"Mullen\", role = c(\"aut\"),\nemail = \"lincoln@lincolnmullen.com\",\ncomment = c(ORCID = \"0000-0001-5103-6917\")),\nperson(\"Os\", \"Keyes\", role = c(\"ctb\"),\nemail = \"ironholds@gmail.com\",\ncomment = c(ORCID = \"0000-0001-5196-609X\")),\nperson(\"Dmitriy\", \"Selivanov\", role = c(\"ctb\"),\nemail = \"selivanov.dmitriy@gmail.com\"),\nperson(\"Jeffrey\", \"Arnold\", role = c(\"ctb\"),\nemail = \"jeffrey.arnold@gmail.com\",\ncomment = c(ORCID = \"0000-0001-9953-3904\")),\nperson(\"Kenneth\", \"Benoit\", role = c(\"ctb\"),\nemail = \"kbenoit@lse.ac.uk\",\ncomment = c(ORCID = \"0000-0002-0797-564X\")))",
  "URL": "https://docs.ropensci.org/tokenizers/,\nhttps://github.com/ropensci/tokenizers",
  "BugReports": "https://github.com/ropensci/tokenizers/issues",
  "RoxygenNote": "7.3.1",
  "Encoding": "UTF-8",
  "VignetteBuilder": "knitr",
  "Config/pak/sysreqs": "libicu-dev",
  "Repository": "https://ropensci.r-universe.dev",
  "Date/Publication": "2024-03-27 09:33:34 UTC",
  "RemoteUrl": "https://github.com/ropensci/tokenizers",
  "RemoteRef": "master",
  "RemoteSha": "b80863d088d4b39695b602ca11e061ac34770ec7",
  "NeedsCompilation": "yes",
  "Packaged": {
    "Date": "2026-04-15 08:05:05 UTC",
    "User": "root"
  },
  "Author": "Thomas Charlon [aut, cre] (ORCID:\n<https://orcid.org/0000-0001-7497-0470>),\nLincoln Mullen [aut] (ORCID: <https://orcid.org/0000-0001-5103-6917>),\nOs Keyes [ctb] (ORCID: <https://orcid.org/0000-0001-5196-609X>),\nDmitriy Selivanov [ctb],\nJeffrey Arnold [ctb] (ORCID: <https://orcid.org/0000-0001-9953-3904>),\nKenneth Benoit [ctb] (ORCID: <https://orcid.org/0000-0002-0797-564X>)",
  "Maintainer": "Thomas Charlon <charlon@protonmail.com>",
  "MD5sum": "69c356d844511eb4edad8fe24731506d",
  "_user": "ropensci",
  "_type": "src",
  "_file": "tokenizers_0.3.1.tar.gz",
  "_fileid": "0be76b33f449a15c7c0ab79f0650a1b5eaeec0ad80033d8822166683e285137c",
  "_filesize": 576593,
  "_sha256": "0be76b33f449a15c7c0ab79f0650a1b5eaeec0ad80033d8822166683e285137c",
  "_created": "2026-04-15T08:05:05.000Z",
  "_published": "2026-04-15T08:10:43.274Z",
  "_distro": "noble",
  "_jobs": [
    {
      "job": 71413811256,
      "time": 145,
      "config": "linux-devel-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "6445795297"
    },
    {
      "job": 71413811232,
      "time": 138,
      "config": "linux-devel-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "6445793111"
    },
    {
      "job": 71413811280,
      "time": 138,
      "config": "linux-release-arm64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "6445793246"
    },
    {
      "job": 71413811279,
      "time": 137,
      "config": "linux-release-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "6445792882"
    },
    {
      "job": 71413811397,
      "time": 96,
      "config": "macos-oldrel-arm64",
      "r": "4.5.3",
      "check": "ERROR",
      "artifact": "6445781139"
    },
    {
      "job": 71413811269,
      "time": 259,
      "config": "macos-oldrel-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "6445826925"
    },
    {
      "job": 71413811264,
      "time": 97,
      "config": "macos-release-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "6445781522"
    },
    {
      "job": 71413811283,
      "time": 291,
      "config": "macos-release-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "6445836504"
    },
    {
      "job": 71413441729,
      "time": 159,
      "config": "pkgdown",
      "r": "4.5.2",
      "check": "OK",
      "artifact": "6445752188"
    },
    {
      "job": 71413441734,
      "time": 169,
      "config": "source",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "6445755297"
    },
    {
      "job": 71413811220,
      "time": 116,
      "config": "wasm-release",
      "r": "4.5.1",
      "check": "OK",
      "artifact": "6445786854"
    },
    {
      "job": 71413811226,
      "time": 110,
      "config": "windows-devel",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "6445785476"
    },
    {
      "job": 71413811244,
      "time": 95,
      "config": "windows-oldrel",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "6445781123"
    },
    {
      "job": 71413811252,
      "time": 113,
      "config": "windows-release",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "6445786296"
    }
  ],
  "_buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861",
  "_status": "success",
  "_host": "GitHub-Actions",
  "_upstream": "https://github.com/ropensci/tokenizers",
  "_commit": {
    "id": "b80863d088d4b39695b602ca11e061ac34770ec7",
    "author": "tcharlon <charlon@protonmail.com>",
    "committer": "tcharlon <charlon@protonmail.com>",
    "message": "new maintainer\n",
    "time": 1711532014
  },
  "_maintainer": {
    "name": "Thomas Charlon",
    "email": "charlon@protonmail.com",
    "login": "thomaschln",
    "linkedin": "in/thomas-charlon-meng-phd-aba0a3275",
    "orcid": "0000-0001-7497-0470",
    "uuid": 2394508
  },
  "_registered": true,
  "_dependencies": [
    {
      "package": "R",
      "version": ">= 3.1.3",
      "role": "Depends"
    },
    {
      "package": "Rcpp",
      "role": "LinkingTo"
    },
    {
      "package": "stringi",
      "version": ">= 1.0.1",
      "role": "Imports"
    },
    {
      "package": "Rcpp",
      "version": ">= 0.12.3",
      "role": "Imports"
    },
    {
      "package": "SnowballC",
      "version": ">= 0.5.1",
      "role": "Imports"
    },
    {
      "package": "covr",
      "role": "Suggests"
    },
    {
      "package": "knitr",
      "role": "Suggests"
    },
    {
      "package": "rmarkdown",
      "role": "Suggests"
    },
    {
      "package": "stopwords",
      "version": ">= 0.9.0",
      "role": "Suggests"
    },
    {
      "package": "testthat",
      "role": "Suggests"
    }
  ],
  "_owner": "ropensci",
  "_selfowned": true,
  "_usedby": 77,
  "_updates": [],
  "_tags": [],
  "_topics": [
    "nlp",
    "peer-reviewed",
    "text-mining",
    "tokenizer",
    "cpp"
  ],
  "_stars": 188,
  "_contributors": [
    {
      "user": "lmullen",
      "count": 175,
      "uuid": 183672
    },
    {
      "user": "dselivanov",
      "count": 6,
      "uuid": 5123805
    },
    {
      "user": "kbenoit",
      "count": 4,
      "uuid": 2182246
    },
    {
      "user": "jrnold",
      "count": 4,
      "uuid": 123968
    },
    {
      "user": "chrismuir",
      "count": 1,
      "uuid": 13386824
    },
    {
      "user": "emilhvitfeldt",
      "count": 1,
      "uuid": 14034784
    },
    {
      "user": "hideaki",
      "count": 1,
      "uuid": 19518
    },
    {
      "user": "jeroen",
      "count": 1,
      "uuid": 216319
    },
    {
      "user": "juliasilge",
      "count": 1,
      "uuid": 12505835
    },
    {
      "user": "karthik",
      "count": 1,
      "uuid": 138494
    },
    {
      "user": "maelle",
      "count": 1,
      "uuid": 8360597
    },
    {
      "user": "ironholds",
      "count": 1,
      "uuid": 2487262
    },
    {
      "user": "thomaschln",
      "count": 1,
      "uuid": 2394508
    }
  ],
  "_userbio": {
    "uuid": 1200269,
    "type": "organization",
    "name": "rOpenSci",
    "description": "Tools and R Packages for Open Science"
  },
  "_downloads": {
    "count": 40813,
    "source": "https://cranlogs.r-pkg.org/downloads/total/last-month/tokenizers"
  },
  "_mentions": 1,
  "_devurl": "https://github.com/ropensci/tokenizers",
  "_pkgdown": "https://docs.ropensci.org/tokenizers/",
  "_searchresults": 1116,
  "_metadata": {
    "review": {
      "id": 33,
      "status": "reviewed",
      "version": "0.1.1",
      "organization": "rOpenSci Software Review",
      "url": "https://github.com/ropensci/software-review/issues/33"
    },
    "ropensci_category": "scalereprod"
  },
  "_rbuild": "4.5.3",
  "_assets": [
    "extra/citation.cff",
    "extra/citation.html",
    "extra/citation.json",
    "extra/citation.txt",
    "extra/contents.json",
    "extra/NEWS.html",
    "extra/NEWS.txt",
    "extra/readme.html",
    "extra/readme.md",
    "extra/tokenizers.html",
    "manual.pdf"
  ],
  "_homeurl": "https://github.com/ropensci/tokenizers",
  "_realowner": "ropensci",
  "_cranurl": true,
  "_releases": [
    {
      "version": "0.1.0",
      "date": "2016-04-02"
    },
    {
      "version": "0.1.1",
      "date": "2016-04-04"
    },
    {
      "version": "0.1.2",
      "date": "2016-04-14"
    },
    {
      "version": "0.1.3",
      "date": "2016-08-18"
    },
    {
      "version": "0.1.4",
      "date": "2016-08-29"
    },
    {
      "version": "0.2.0",
      "date": "2018-03-21"
    },
    {
      "version": "0.2.1",
      "date": "2018-03-29"
    },
    {
      "version": "0.2.3",
      "date": "2022-09-23"
    },
    {
      "version": "0.3.0",
      "date": "2022-12-22"
    }
  ],
  "_exports": [
    "chunk_text",
    "count_characters",
    "count_sentences",
    "count_words",
    "tokenize_character_shingles",
    "tokenize_characters",
    "tokenize_lines",
    "tokenize_ngrams",
    "tokenize_paragraphs",
    "tokenize_ptb",
    "tokenize_regex",
    "tokenize_sentences",
    "tokenize_skip_ngrams",
    "tokenize_word_stems",
    "tokenize_words"
  ],
  "_datasets": [
    {
      "name": "mobydick",
      "title": "The text of Moby Dick",
      "object": "mobydick",
      "class": [
        "character"
      ],
      "fields": [],
      "table": false,
      "tojson": true
    }
  ],
  "_help": [
    {
      "page": "basic-tokenizers",
      "title": "Basic tokenizers",
      "topics": [
        "basic-tokenizers",
        "tokenize_characters",
        "tokenize_lines",
        "tokenize_paragraphs",
        "tokenize_regex",
        "tokenize_sentences",
        "tokenize_words"
      ]
    },
    {
      "page": "chunk_text",
      "title": "Chunk text into smaller segments",
      "topics": [
        "chunk_text"
      ]
    },
    {
      "page": "word-counting",
      "title": "Count words, sentences, characters",
      "topics": [
        "count_characters",
        "count_sentences",
        "count_words"
      ]
    },
    {
      "page": "mobydick",
      "title": "The text of Moby Dick",
      "topics": [
        "mobydick"
      ]
    },
    {
      "page": "ngram-tokenizers",
      "title": "N-gram tokenizers",
      "topics": [
        "ngram-tokenizers",
        "tokenize_ngrams",
        "tokenize_skip_ngrams"
      ]
    },
    {
      "page": "shingle-tokenizers",
      "title": "Character shingle tokenizers",
      "topics": [
        "tokenize_character_shingles"
      ]
    },
    {
      "page": "ptb-tokenizer",
      "title": "Penn Treebank Tokenizer",
      "topics": [
        "tokenize_ptb"
      ]
    },
    {
      "page": "stem-tokenizers",
      "title": "Word stem tokenizer",
      "topics": [
        "tokenize_word_stems"
      ]
    },
    {
      "page": "tokenizers",
      "title": "Tokenizers",
      "topics": [
        "tokenizers-package",
        "tokenizers"
      ]
    }
  ],
  "_readme": "https://github.com/ropensci/tokenizers/raw/master/README.md",
  "_rundeps": [
    "Rcpp",
    "SnowballC",
    "stringi"
  ],
  "_sysdeps": [
    {
      "shlib": "libstdc++",
      "package": "libstdc++6",
      "source": "gcc",
      "version": "14.2.0-4ubuntu2~24.04.1",
      "name": "c++",
      "homepage": "http://gcc.gnu.org/",
      "description": "GNU Standard C++ Library v3"
    }
  ],
  "_vignettes": [
    {
      "source": "introduction-to-tokenizers.Rmd",
      "filename": "introduction-to-tokenizers.html",
      "title": "Introduction to the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [
        "Package overview",
        "Character and character-shingle tokenizers",
        "Word and word-stem tokenizers",
        "N-gram and skip n-gram tokenizers",
        "Sentence and paragraph tokenizers",
        "Text chunking",
        "Counting words, characters, sentences"
      ],
      "created": "2016-08-11 20:12:37",
      "modified": "2022-12-19 21:14:10",
      "commits": 12
    },
    {
      "source": "tif-and-tokenizers.Rmd",
      "filename": "tif-and-tokenizers.html",
      "title": "The Text Interchange Formats and the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [],
      "created": "2018-03-14 00:10:35",
      "modified": "2022-09-23 18:07:51",
      "commits": 5
    }
  ],
  "_score": 13.375413788997777,
  "_indexed": true,
  "_nocasepkg": "tokenizers",
  "_universes": [
    "ropensci",
    "thomaschln"
  ],
  "_binaries": [
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-04-15T08:07:31.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "d2b2e0edde1b2ba8e85cba80d5bce66a9216be47be2cdd050a8ee07bab9aa6c4",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-04-15T08:07:19.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "592f08f38493a0f4acf62a2e4544475073dc670a07cb8e542189f793031ea501",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.5.3",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-04-15T08:07:26.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "f51d4ee1a23c6866b52c89128f642ee21b0856413a5cef880deaa3f1b675b080",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.5.3",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-04-15T08:07:21.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "faaef2d455b49d89027257c022ccff032ad5e6422c935dad26bfae46c8391b4a",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-04-15T08:06:42.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "3961e4632622b1a36a55c4bf72f75c9b7b752a2540ce086f41d92a0535916de7",
      "status": "failure",
      "check": "ERROR",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-04-15T08:08:48.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "6ef3804b55f7c46cdef116570e276fe30207167bfa582024aa1efd04177748fc",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-04-15T08:06:44.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "ed060c5061566960638792852c30490f662417f37b9cc7e064843c3e3a477e9c",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-04-15T08:08:59.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "44ba083888c9f56ff645b5987ab3cb2860570a821d8342138eab20e122aaab5f",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.5.1",
      "os": "wasm",
      "version": "0.3.1",
      "date": "2026-04-15T08:07:15.000Z",
      "arch": "emscripten",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "2bd532c932dcfd6c933caf595d94cfa0d7c867fe9de6f78049cdfb21a123061a",
      "status": "success",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.7.0",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-04-15T08:06:30.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "1b6c3ea31ad9271b735a2bb65f82fe35d71b216a1802c1066fb1e656850f28e5",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.5.3",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-04-15T08:06:17.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "d4806115f375fbe280fa5b3cb012f686a1e7f1a4fe3cb2f7d814f51d74d38108",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    },
    {
      "r": "4.6.0",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-04-15T08:06:34.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "5f067b41a2357dc3f9b6107336190beacce38abd418039bc57514fe4f43f5a5a",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/24443324861"
    }
  ]
}