{
  "_id": "69b7cc72a485b674afd00656",
  "Package": "tokenizers",
  "Type": "Package",
  "Title": "Fast, Consistent Tokenization of Natural Language Text",
  "Version": "0.3.1",
  "Date": "2024-03-27",
  "Description": "Convert natural language text into tokens. Includes\ntokenizers for shingled n-grams, skip n-grams, words, word\nstems, sentences, paragraphs, characters, shingled characters,\nlines, Penn Treebank, regular expressions, as well as functions\nfor counting characters, words, and sentences, and a function\nfor splitting longer texts into separate documents, each with\nthe same number of words.  The tokenizers have a consistent\ninterface, and the package is built on the 'stringi' and 'Rcpp'\npackages for fast yet correct tokenization in 'UTF-8'.",
  "License": "MIT + file LICENSE",
  "LazyData": "yes",
  "Authors@R": "c(person(\"Thomas\", \"Charlon\", role = c(\"aut\", \"cre\"),\nemail = \"charlon@protonmail.com\",\ncomment = c(ORCID = \"0000-0001-7497-0470\")),\nperson(\"Lincoln\", \"Mullen\", role = c(\"aut\"),\nemail = \"lincoln@lincolnmullen.com\",\ncomment = c(ORCID = \"0000-0001-5103-6917\")),\nperson(\"Os\", \"Keyes\", role = c(\"ctb\"),\nemail = \"ironholds@gmail.com\",\ncomment = c(ORCID = \"0000-0001-5196-609X\")),\nperson(\"Dmitriy\", \"Selivanov\", role = c(\"ctb\"),\nemail = \"selivanov.dmitriy@gmail.com\"),\nperson(\"Jeffrey\", \"Arnold\", role = c(\"ctb\"),\nemail = \"jeffrey.arnold@gmail.com\",\ncomment = c(ORCID = \"0000-0001-9953-3904\")),\nperson(\"Kenneth\", \"Benoit\", role = c(\"ctb\"),\nemail = \"kbenoit@lse.ac.uk\",\ncomment = c(ORCID = \"0000-0002-0797-564X\")))",
  "URL": "https://docs.ropensci.org/tokenizers/,\nhttps://github.com/ropensci/tokenizers",
  "BugReports": "https://github.com/ropensci/tokenizers/issues",
  "RoxygenNote": "7.3.1",
  "Encoding": "UTF-8",
  "VignetteBuilder": "knitr",
  "Config/pak/sysreqs": "libicu-dev",
  "Repository": "https://ropensci.r-universe.dev",
  "Date/Publication": "2024-03-27 09:33:34 UTC",
  "RemoteUrl": "https://github.com/ropensci/tokenizers",
  "RemoteRef": "master",
  "RemoteSha": "b80863d088d4b39695b602ca11e061ac34770ec7",
  "NeedsCompilation": "yes",
  "Packaged": {
    "Date": "2026-03-16 09:19:27 UTC",
    "User": "root"
  },
  "Author": "Thomas Charlon [aut, cre] (ORCID:\n<https://orcid.org/0000-0001-7497-0470>),\nLincoln Mullen [aut] (ORCID: <https://orcid.org/0000-0001-5103-6917>),\nOs Keyes [ctb] (ORCID: <https://orcid.org/0000-0001-5196-609X>),\nDmitriy Selivanov [ctb],\nJeffrey Arnold [ctb] (ORCID: <https://orcid.org/0000-0001-9953-3904>),\nKenneth Benoit [ctb] (ORCID: <https://orcid.org/0000-0002-0797-564X>)",
  "Maintainer": "Thomas Charlon <charlon@protonmail.com>",
  "MD5sum": "4d762cf0ee5ef1c48067e79dad27a2f3",
  "_user": "ropensci",
  "_type": "src",
  "_file": "tokenizers_0.3.1.tar.gz",
  "_fileid": "0153661199d72728fbdf83109b21bba55cafa35523f7e3023b6944e88e6db924",
  "_filesize": 574742,
  "_sha256": "0153661199d72728fbdf83109b21bba55cafa35523f7e3023b6944e88e6db924",
  "_created": "2026-03-16T09:19:27.000Z",
  "_published": "2026-03-16T09:25:06.529Z",
  "_distro": "noble",
  "_jobs": [
    {
      "job": 67201278512,
      "time": 116,
      "config": "linux-devel-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "5940308845"
    },
    {
      "job": 67201278563,
      "time": 144,
      "config": "linux-devel-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "5940315961"
    },
    {
      "job": 67201278516,
      "time": 139,
      "config": "linux-release-arm64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "5940314704"
    },
    {
      "job": 67201278499,
      "time": 117,
      "config": "linux-release-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "5940308968"
    },
    {
      "job": 67201278531,
      "time": 108,
      "config": "macos-devel-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "5940306851"
    },
    {
      "job": 67201278553,
      "time": 293,
      "config": "macos-devel-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "5940355020"
    },
    {
      "job": 67201278518,
      "time": 80,
      "config": "macos-release-arm64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "5940299827"
    },
    {
      "job": 67201278536,
      "time": 235,
      "config": "macos-release-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "5940339642"
    },
    {
      "job": 67200901549,
      "time": 151,
      "config": "pkgdown",
      "r": "4.5.2",
      "check": "OK",
      "artifact": "5940270580"
    },
    {
      "job": 67200901553,
      "time": 180,
      "config": "source",
      "r": "4.5.2",
      "check": "OK",
      "artifact": "5940278020"
    },
    {
      "job": 67201278496,
      "time": 116,
      "config": "wasm-release",
      "r": "4.5.1",
      "check": "OK",
      "artifact": "5940308685"
    },
    {
      "job": 67201278517,
      "time": 121,
      "config": "windows-devel",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "5940310172"
    },
    {
      "job": 67201278525,
      "time": 121,
      "config": "windows-oldrel",
      "r": "4.4.3",
      "check": "NOTE",
      "artifact": "5940309625"
    },
    {
      "job": 67201278506,
      "time": 99,
      "config": "windows-release",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "5940304384"
    }
  ],
  "_buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368",
  "_status": "success",
  "_host": "GitHub-Actions",
  "_upstream": "https://github.com/ropensci/tokenizers",
  "_commit": {
    "id": "b80863d088d4b39695b602ca11e061ac34770ec7",
    "author": "tcharlon <charlon@protonmail.com>",
    "committer": "tcharlon <charlon@protonmail.com>",
    "message": "new maintainer\n",
    "time": 1711532014
  },
  "_maintainer": {
    "name": "Thomas Charlon",
    "email": "charlon@protonmail.com",
    "login": "thomaschln",
    "linkedin": "in/thomas-charlon-meng-phd-aba0a3275",
    "orcid": "0000-0001-7497-0470",
    "uuid": 2394508
  },
  "_registered": true,
  "_dependencies": [
    {
      "package": "R",
      "version": ">= 3.1.3",
      "role": "Depends"
    },
    {
      "package": "Rcpp",
      "role": "LinkingTo"
    },
    {
      "package": "stringi",
      "version": ">= 1.0.1",
      "role": "Imports"
    },
    {
      "package": "Rcpp",
      "version": ">= 0.12.3",
      "role": "Imports"
    },
    {
      "package": "SnowballC",
      "version": ">= 0.5.1",
      "role": "Imports"
    },
    {
      "package": "covr",
      "role": "Suggests"
    },
    {
      "package": "knitr",
      "role": "Suggests"
    },
    {
      "package": "rmarkdown",
      "role": "Suggests"
    },
    {
      "package": "stopwords",
      "version": ">= 0.9.0",
      "role": "Suggests"
    },
    {
      "package": "testthat",
      "role": "Suggests"
    }
  ],
  "_owner": "ropensci",
  "_selfowned": true,
  "_usedby": 77,
  "_updates": [],
  "_tags": [],
  "_topics": [
    "nlp",
    "peer-reviewed",
    "text-mining",
    "tokenizer",
    "cpp"
  ],
  "_stars": 187,
  "_contributors": [
    {
      "user": "lmullen",
      "count": 175,
      "uuid": 183672
    },
    {
      "user": "dselivanov",
      "count": 6,
      "uuid": 5123805
    },
    {
      "user": "kbenoit",
      "count": 4,
      "uuid": 2182246
    },
    {
      "user": "jrnold",
      "count": 4,
      "uuid": 123968
    },
    {
      "user": "chrismuir",
      "count": 1,
      "uuid": 13386824
    },
    {
      "user": "emilhvitfeldt",
      "count": 1,
      "uuid": 14034784
    },
    {
      "user": "hideaki",
      "count": 1,
      "uuid": 19518
    },
    {
      "user": "jeroen",
      "count": 1,
      "uuid": 216319
    },
    {
      "user": "juliasilge",
      "count": 1,
      "uuid": 12505835
    },
    {
      "user": "karthik",
      "count": 1,
      "uuid": 138494
    },
    {
      "user": "maelle",
      "count": 1,
      "uuid": 8360597
    },
    {
      "user": "ironholds",
      "count": 1,
      "uuid": 2487262
    },
    {
      "user": "thomaschln",
      "count": 1,
      "uuid": 2394508
    }
  ],
  "_userbio": {
    "uuid": 1200269,
    "type": "organization",
    "name": "rOpenSci",
    "description": "Tools and R Packages for Open Science"
  },
  "_downloads": {
    "count": 39491,
    "source": "https://cranlogs.r-pkg.org/downloads/total/last-month/tokenizers"
  },
  "_mentions": 1,
  "_devurl": "https://github.com/ropensci/tokenizers",
  "_pkgdown": "https://docs.ropensci.org/tokenizers/",
  "_searchresults": 1116,
  "_metadata": {
    "review": {
      "id": 33,
      "status": "reviewed",
      "version": "0.1.1",
      "organization": "rOpenSci Software Review",
      "url": "https://github.com/ropensci/software-review/issues/33"
    },
    "ropensci_category": "scalereprod"
  },
  "_rbuild": "4.5.2",
  "_assets": [
    "extra/citation.cff",
    "extra/citation.html",
    "extra/citation.json",
    "extra/citation.txt",
    "extra/contents.json",
    "extra/NEWS.html",
    "extra/NEWS.txt",
    "extra/readme.html",
    "extra/readme.md",
    "extra/tokenizers.html",
    "manual.pdf"
  ],
  "_homeurl": "https://github.com/ropensci/tokenizers",
  "_realowner": "ropensci",
  "_cranurl": true,
  "_releases": [
    {
      "version": "0.1.0",
      "date": "2016-04-02"
    },
    {
      "version": "0.1.1",
      "date": "2016-04-04"
    },
    {
      "version": "0.1.2",
      "date": "2016-04-14"
    },
    {
      "version": "0.1.3",
      "date": "2016-08-18"
    },
    {
      "version": "0.1.4",
      "date": "2016-08-29"
    },
    {
      "version": "0.2.0",
      "date": "2018-03-21"
    },
    {
      "version": "0.2.1",
      "date": "2018-03-29"
    },
    {
      "version": "0.2.3",
      "date": "2022-09-23"
    },
    {
      "version": "0.3.0",
      "date": "2022-12-22"
    }
  ],
  "_exports": [
    "chunk_text",
    "count_characters",
    "count_sentences",
    "count_words",
    "tokenize_character_shingles",
    "tokenize_characters",
    "tokenize_lines",
    "tokenize_ngrams",
    "tokenize_paragraphs",
    "tokenize_ptb",
    "tokenize_regex",
    "tokenize_sentences",
    "tokenize_skip_ngrams",
    "tokenize_word_stems",
    "tokenize_words"
  ],
  "_datasets": [
    {
      "name": "mobydick",
      "title": "The text of Moby Dick",
      "object": "mobydick",
      "class": [
        "character"
      ],
      "fields": [],
      "table": false,
      "tojson": true
    }
  ],
  "_help": [
    {
      "page": "basic-tokenizers",
      "title": "Basic tokenizers",
      "topics": [
        "basic-tokenizers",
        "tokenize_characters",
        "tokenize_lines",
        "tokenize_paragraphs",
        "tokenize_regex",
        "tokenize_sentences",
        "tokenize_words"
      ]
    },
    {
      "page": "chunk_text",
      "title": "Chunk text into smaller segments",
      "topics": [
        "chunk_text"
      ]
    },
    {
      "page": "word-counting",
      "title": "Count words, sentences, characters",
      "topics": [
        "count_characters",
        "count_sentences",
        "count_words"
      ]
    },
    {
      "page": "mobydick",
      "title": "The text of Moby Dick",
      "topics": [
        "mobydick"
      ]
    },
    {
      "page": "ngram-tokenizers",
      "title": "N-gram tokenizers",
      "topics": [
        "ngram-tokenizers",
        "tokenize_ngrams",
        "tokenize_skip_ngrams"
      ]
    },
    {
      "page": "shingle-tokenizers",
      "title": "Character shingle tokenizers",
      "topics": [
        "tokenize_character_shingles"
      ]
    },
    {
      "page": "ptb-tokenizer",
      "title": "Penn Treebank Tokenizer",
      "topics": [
        "tokenize_ptb"
      ]
    },
    {
      "page": "stem-tokenizers",
      "title": "Word stem tokenizer",
      "topics": [
        "tokenize_word_stems"
      ]
    },
    {
      "page": "tokenizers",
      "title": "Tokenizers",
      "topics": [
        "tokenizers-package",
        "tokenizers"
      ]
    }
  ],
  "_readme": "https://github.com/ropensci/tokenizers/raw/master/README.md",
  "_rundeps": [
    "Rcpp",
    "SnowballC",
    "stringi"
  ],
  "_sysdeps": [
    {
      "shlib": "libstdc++",
      "package": "libstdc++6",
      "source": "gcc",
      "version": "14.2.0-4ubuntu2~24.04.1",
      "name": "c++",
      "homepage": "http://gcc.gnu.org/",
      "description": "GNU Standard C++ Library v3"
    }
  ],
  "_vignettes": [
    {
      "source": "introduction-to-tokenizers.Rmd",
      "filename": "introduction-to-tokenizers.html",
      "title": "Introduction to the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [
        "Package overview",
        "Character and character-shingle tokenizers",
        "Word and word-stem tokenizers",
        "N-gram and skip n-gram tokenizers",
        "Sentence and paragraph tokenizers",
        "Text chunking",
        "Counting words, characters, sentences"
      ],
      "created": "2016-08-11 20:12:37",
      "modified": "2022-12-19 21:14:10",
      "commits": 12
    },
    {
      "source": "tif-and-tokenizers.Rmd",
      "filename": "tif-and-tokenizers.html",
      "title": "The Text Interchange Formats and the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [],
      "created": "2018-03-14 00:10:35",
      "modified": "2022-09-23 18:07:51",
      "commits": 5
    }
  ],
  "_score": 13.358797158256337,
  "_indexed": true,
  "_nocasepkg": "tokenizers",
  "_universes": [
    "ropensci",
    "thomaschln"
  ],
  "_binaries": [
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-03-16T09:21:24.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "1d024d6b0ccdda7019ed0b838e35a9b9662eed9d8173ae9b994bbece1c30d93a",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-03-16T09:21:47.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "d87101bc302a299ef1ebafdc02e309928bf9ebd505a851fdfa6d91ad8430d074",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.5.3",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-03-16T09:21:47.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "8b1adda33bb5749d7b1eb7ac3d243c0b1fb18d8dde7d390f279f2e1892c1724e",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.5.3",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-03-16T09:21:22.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "4b7b06e32869ed2e81973ae3b6d02980e0a66ce1e15f64eff7b0e9530ab26f55",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-03-16T09:21:12.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "9ce9eb3b98f39a48d2c14b398b9563231210c866a5d94c23f7111f1064bd0a71",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-03-16T09:23:03.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "c979ca23be44fda2f9ad82ca349a9bf92a08628fb11a0bdcfb2272ed36254cd6",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-03-16T09:20:52.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "ca4a14cf50e5483307cba61455a007a6ad297575cc8e6e34d8d2bded987a249f",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-03-16T09:22:57.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "cf51ac33162bca33166e75f5b3d8e0fd3c565bd425b7416a73056bd7032c61d8",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.5.1",
      "os": "wasm",
      "version": "0.3.1",
      "date": "2026-03-16T09:21:36.000Z",
      "arch": "emscripten",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "054779f2f2f91e57420471bfad4f5f3972081c3162ed892fd3718d2f8501a451",
      "status": "success",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.6.0",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-03-16T09:21:02.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "b32a762823cc661402e5413f75172c8adf11a92ce7330ebb1c9e97a32daa064e",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    },
    {
      "r": "4.5.3",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-03-16T09:20:42.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "0508ffe5cb7ea981326399ef57d1898be07ba43dc5bd02ef93abff6f1a064aa0",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/23136284368"
    }
  ]
}