Title: | Data Version Control for the Targets Package |
---|---|
Description: | In computationally demanding data analysis pipelines, the 'targets' R package (2021, <doi:10.21105/joss.02959>) maintains an up-to-date set of results while skipping tasks that do not need to rerun. This process increases speed and increases trust in the final end product. However, it also overwrites old output with new output, and past results disappear by default. To preserve historical output, the 'gittargets' package captures version-controlled snapshots of the data store, and each snapshot links to the underlying commit of the source code. That way, when the user rolls back the code to a previous branch or commit, 'gittargets' can recover the data contemporaneous with that commit so that all targets remain up to date. |
Authors: | William Michael Landau [aut, cre] , Saras Windecker [rev], David Neuzerling [rev], Eli Lilly and Company [cph] |
Maintainer: | William Michael Landau <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.0.7.9000 |
Built: | 2024-12-14 06:22:37 UTC |
Source: | https://github.com/ropensci/gittargets |
In computationally demanding data analysis pipelines,
the targets
R package maintains an up-to-date set of results
while skipping tasks that do not need to rerun. This process
increases speed and increases trust in the final end product.
However, it also overwrites old output with new output,
and past results disappear by default. To preserve historical output,
the gittargets
package captures version-controlled snapshots
of the data store, and each snapshot links to the underlying
commit of the source code. That way, when the user rolls back
the code to a previous branch or commit, gittargets
can recover
the data contemporaneous with that commit so that all targets
remain up to date.
Check out a snapshot of the data associated with
a particular code commit (default: HEAD
).
tar_git_checkout( ref = "HEAD", code = getwd(), store = targets::tar_config_get("store"), force = FALSE, verbose = TRUE )
tar_git_checkout( ref = "HEAD", code = getwd(), store = targets::tar_config_get("store"), force = FALSE, verbose = TRUE )
ref |
Character of length 1. SHA1 hash, branch name,
or other reference in the code repository
that points to a code commit. (You can also identify the code
commit by supplying a data branch of the form code=<SHA1>.) Once the desired code commit is identified, tar_git_checkout() checks out the latest data snapshot that matches it.
If |
code |
Character of length 1, directory path to the code repository,
usually the root of the |
store |
Character of length 1, path to the data store of the pipeline.
If |
force |
Logical of length 1, whether to ignore conflicts and overwrite modified files. |
verbose |
Logical of length 1, whether to print R console messages confirming that a snapshot was created. |
Nothing (invisibly).
Other git:
tar_git_init()
,
tar_git_log()
,
tar_git_ok()
,
tar_git_snapshot()
,
tar_git_status_code()
,
tar_git_status_data()
,
tar_git_status_targets()
,
tar_git_status()
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's filespace. # Work on an initial branch. targets::tar_script(tar_target(data, "old_data")) targets::tar_make() targets::tar_read(data) # "old_data" gert::git_init() gert::git_add("_targets.R") gert::git_commit("First commit") gert::git_branch_create("old_branch") tar_git_init() # Work on a new branch. tar_git_snapshot(status = FALSE, verbose = FALSE) targets::tar_script(tar_target(data, "new_data")) targets::tar_make() targets::tar_read(data) # "new_data" gert::git_branch_create("new_branch") gert::git_add("_targets.R") gert::git_commit("Second commit") tar_git_snapshot(status = FALSE, verbose = FALSE) # Go back to the old branch. gert::git_branch_checkout("old_branch") # The target is out of date because we only reverted the code. targets::tar_outdated() # But tar_git_checkout() lets us restore the old version of the data! tar_git_checkout() targets::tar_read(data) # "old_data" # Now, the target is up to date! And we did not even have to rerun it! targets::tar_outdated() }) }
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's filespace. # Work on an initial branch. targets::tar_script(tar_target(data, "old_data")) targets::tar_make() targets::tar_read(data) # "old_data" gert::git_init() gert::git_add("_targets.R") gert::git_commit("First commit") gert::git_branch_create("old_branch") tar_git_init() # Work on a new branch. tar_git_snapshot(status = FALSE, verbose = FALSE) targets::tar_script(tar_target(data, "new_data")) targets::tar_make() targets::tar_read(data) # "new_data" gert::git_branch_create("new_branch") gert::git_add("_targets.R") gert::git_commit("Second commit") tar_git_snapshot(status = FALSE, verbose = FALSE) # Go back to the old branch. gert::git_branch_checkout("old_branch") # The target is out of date because we only reverted the code. targets::tar_outdated() # But tar_git_checkout() lets us restore the old version of the data! tar_git_checkout() targets::tar_read(data) # "old_data" # Now, the target is up to date! And we did not even have to rerun it! targets::tar_outdated() }) }
Initialize a Git repository for a targets
data store.
tar_git_init( store = targets::tar_config_get("store"), stash_gitignore = TRUE, git_lfs = TRUE, verbose = TRUE )
tar_git_init( store = targets::tar_config_get("store"), stash_gitignore = TRUE, git_lfs = TRUE, verbose = TRUE )
store |
Character of length 1, path to the data store of the pipeline.
If |
stash_gitignore |
Logical of length 1, whether to temporarily
stash the |
git_lfs |
Logical, whether to automatically opt into Git LFS to track
large files in |
verbose |
Logical of length 1, whether to print messages to the R console. |
tar_git_init()
also writes a .gitattributes
file to the
store to automatically track target output data with git-lfs
if it is installed on your system.
NULL
(invisibly).
The targets
package writes a .gitignore
file to new data stores
in order to prevent accidental commits to the code Git repository.
Unfortunately, for gittargets
, this automatic .gitignore
file
interferes with proper data versioning. So by default, gittargets
temporarily stashes it to a hidden file called .gittargets_gitignore
inside the data store. If your R program crashes while the stash
is active, you can simply move it manually back to .gitignore
or run tar_git_status_data()
to restore the stash automatically
if no .gitignore
already exists.
Other git:
tar_git_checkout()
,
tar_git_log()
,
tar_git_ok()
,
tar_git_snapshot()
,
tar_git_status_code()
,
tar_git_status_data()
,
tar_git_status_targets()
,
tar_git_status()
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() tar_git_init() }) }
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() tar_git_init() }) }
Show all the data snapshots of a code branch.
tar_git_log( code = getwd(), store = targets::tar_config_get("store"), branch = gert::git_branch(repo = code), max = 100 )
tar_git_log( code = getwd(), store = targets::tar_config_get("store"), branch = gert::git_branch(repo = code), max = 100 )
code |
Character of length 1, directory path to the code repository,
usually the root of the |
store |
Character of length 1, path to the data store of the pipeline.
If |
branch |
Character of length 1, name of the code repository branch to query. Defaults to the currently checked-out code branch. |
max |
Positive numeric of length 1, maximum number of code commits to inspect for the given branch. |
By design, tar_git_log()
only queries a single
code branch at a time. This allows tar_git_log()
to report more detailed information about the snapshots
of the given code branch.
To query all data snapshots over all branches, simply run
gert::git_branch_list(local = TRUE, repo = "_targets")
.
The valid snapshots show "code=<SHA1>"
in the name
column,
where <SHA1>
is the Git commit hash of the code commit
corresponding to the data snapshot.
A data frame of information about data snapshots and code commits.
Other git:
tar_git_checkout()
,
tar_git_init()
,
tar_git_ok()
,
tar_git_snapshot()
,
tar_git_status_code()
,
tar_git_status_data()
,
tar_git_status_targets()
,
tar_git_status()
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's filespace. targets::tar_script(tar_target(data, 1)) targets::tar_make() gert::git_init() gert::git_add("_targets.R") gert::git_commit("First commit") tar_git_init() tar_git_snapshot(status = FALSE, verbose = FALSE) tar_git_log() }) }
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's filespace. targets::tar_script(tar_target(data, 1)) targets::tar_make() gert::git_init() gert::git_add("_targets.R") gert::git_commit("First commit") tar_git_init() tar_git_snapshot(status = FALSE, verbose = FALSE) tar_git_log() }) }
Check if Git is installed and if user.name
and user.email
are configured globally.
tar_git_ok(verbose = TRUE)
tar_git_ok(verbose = TRUE)
verbose |
Whether to print messages to the console. |
You can install Git from https://git-scm.com/downloads/
and configure your identity using the instructions at
https://git-scm.com/book/en/v2/Getting-Started-First-Time-Git-Setup.
You may find it convenient to run gert::git_config_global_set()
twice: once with name
equal to "user.name"
and once with name equal to "user.email"
.
Logical of length 1, whether Git is installed and configured correctly.
Other git:
tar_git_checkout()
,
tar_git_init()
,
tar_git_log()
,
tar_git_snapshot()
,
tar_git_status_code()
,
tar_git_status_data()
,
tar_git_status_targets()
,
tar_git_status()
tar_git_ok()
tar_git_ok()
Snapshot the Git data repository of a targets
project.
tar_git_snapshot( message = NULL, ref = "HEAD", code = getwd(), script = targets::tar_config_get("script"), store = targets::tar_config_get("store"), stash_gitignore = TRUE, reporter = targets::tar_config_get("reporter_outdated"), envir = parent.frame(), callr_function = callr::r, callr_arguments = NULL, status = interactive(), force = FALSE, pack_refs = TRUE, verbose = TRUE )
tar_git_snapshot( message = NULL, ref = "HEAD", code = getwd(), script = targets::tar_config_get("script"), store = targets::tar_config_get("store"), stash_gitignore = TRUE, reporter = targets::tar_config_get("reporter_outdated"), envir = parent.frame(), callr_function = callr::r, callr_arguments = NULL, status = interactive(), force = FALSE, pack_refs = TRUE, verbose = TRUE )
message |
Optional Git commit message of the data snapshot.
If |
ref |
Character of length 1, reference (branch name, Git SHA1 hash, etc.) of the code commit that will map to the new data snapshot. Defaults to the commit checked out right now. |
code |
Character of length 1, directory path to the code repository,
usually the root of the |
script |
Character of length 1, path to the
target script file. Defaults to |
store |
Character of length 1, path to the data store of the pipeline.
If |
stash_gitignore |
Logical of length 1, whether to temporarily
stash the |
reporter |
Character of length 1, name of the reporter to use. Controls how messages are printed as targets are checked. Choices:
|
envir |
An environment, where to run the target R script
(default: The |
callr_function |
A function from |
callr_arguments |
A list of arguments to |
status |
Logical of length 1, whether to print the project status
with |
force |
Logical of length 1. Force checkout the data branch of an existing data snapshot of the current code commit? |
pack_refs |
Logical of length 1, whether to run |
verbose |
Logical of length 1, whether to print R console messages confirming that a snapshot was created. |
A Git-backed gittargets
data snapshot is a special kind of
Git commit. Every data commit is part of a branch specific to
the current code commit.
That way, when you switch branches or commits in the code,
tar_git_checkout()
checks out the latest data snapshot
that matches the code in your workspace.
That way, your targets can stay up to date even as you
transition among multiple branches.
The targets
package writes a .gitignore
file to new data stores
in order to prevent accidental commits to the code Git repository.
Unfortunately, for gittargets
, this automatic .gitignore
file
interferes with proper data versioning. So by default, gittargets
temporarily stashes it to a hidden file called .gittargets_gitignore
inside the data store. If your R program crashes while the stash
is active, you can simply move it manually back to .gitignore
or run tar_git_status_data()
to restore the stash automatically
if no .gitignore
already exists.
Other git:
tar_git_checkout()
,
tar_git_init()
,
tar_git_log()
,
tar_git_ok()
,
tar_git_status_code()
,
tar_git_status_data()
,
tar_git_status_targets()
,
tar_git_status()
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's filespace. targets::tar_script(tar_target(data, 1)) targets::tar_make() gert::git_init() gert::git_add("_targets.R") gert::git_commit("First commit") tar_git_init() tar_git_snapshot(status = FALSE) }) }
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's filespace. targets::tar_script(tar_target(data, 1)) targets::tar_make() gert::git_init() gert::git_add("_targets.R") gert::git_commit("First commit") tar_git_init() tar_git_snapshot(status = FALSE) }) }
Print the status of the code repository, the data repository, and the targets.
tar_git_status( code = getwd(), script = targets::tar_config_get("script"), store = targets::tar_config_get("store"), stash_gitignore = TRUE, reporter = targets::tar_config_get("reporter_outdated"), envir = parent.frame(), callr_function = callr::r, callr_arguments = NULL )
tar_git_status( code = getwd(), script = targets::tar_config_get("script"), store = targets::tar_config_get("store"), stash_gitignore = TRUE, reporter = targets::tar_config_get("reporter_outdated"), envir = parent.frame(), callr_function = callr::r, callr_arguments = NULL )
code |
Character of length 1, directory path to the code repository,
usually the root of the |
script |
Character of length 1, path to the
target script file. Defaults to |
store |
Character of length 1, path to the data store of the pipeline.
If |
stash_gitignore |
Logical of length 1, whether to temporarily
stash the |
reporter |
Character of length 1, name of the reporter to use. Controls how messages are printed as targets are checked. Choices:
|
envir |
An environment, where to run the target R script
(default: The |
callr_function |
A function from |
callr_arguments |
A list of arguments to |
NULL
(invisibly). Status information is printed
to the R console.
The targets
package writes a .gitignore
file to new data stores
in order to prevent accidental commits to the code Git repository.
Unfortunately, for gittargets
, this automatic .gitignore
file
interferes with proper data versioning. So by default, gittargets
temporarily stashes it to a hidden file called .gittargets_gitignore
inside the data store. If your R program crashes while the stash
is active, you can simply move it manually back to .gitignore
or run tar_git_status_data()
to restore the stash automatically
if no .gitignore
already exists.
Other git:
tar_git_checkout()
,
tar_git_init()
,
tar_git_log()
,
tar_git_ok()
,
tar_git_snapshot()
,
tar_git_status_code()
,
tar_git_status_data()
,
tar_git_status_targets()
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) gert::git_init() tar_git_init() tar_git_status() }) }
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) gert::git_init() tar_git_init() tar_git_status() }) }
Show the Git status of the code repository.
tar_git_status_code(code = getwd())
tar_git_status_code(code = getwd())
code |
Character of length 1, directory path to the code repository,
usually the root of the |
If the code repository exists, the return value is the data frame
produced by gert::git_status(repo = code)
. If the code has no Git
repository, then the return value is NULL
.
Other git:
tar_git_checkout()
,
tar_git_init()
,
tar_git_log()
,
tar_git_ok()
,
tar_git_snapshot()
,
tar_git_status_data()
,
tar_git_status_targets()
,
tar_git_status()
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) gert::git_init() tar_git_init() tar_git_status_code() }) }
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) gert::git_init() tar_git_init() tar_git_status_code() }) }
Show the Git status of the data repository.
tar_git_status_data( store = targets::tar_config_get("store"), stash_gitignore = TRUE )
tar_git_status_data( store = targets::tar_config_get("store"), stash_gitignore = TRUE )
store |
Character of length 1, path to the data store of the pipeline.
If |
stash_gitignore |
Logical of length 1, whether to temporarily
stash the |
If the data repository exists, the return value is the data frame
produced by gert::git_status(repo = store)
. If the data store has no Git
repository, then the return value is NULL
.
The targets
package writes a .gitignore
file to new data stores
in order to prevent accidental commits to the code Git repository.
Unfortunately, for gittargets
, this automatic .gitignore
file
interferes with proper data versioning. So by default, gittargets
temporarily stashes it to a hidden file called .gittargets_gitignore
inside the data store. If your R program crashes while the stash
is active, you can simply move it manually back to .gitignore
or run tar_git_status_data()
to restore the stash automatically
if no .gitignore
already exists.
Other git:
tar_git_checkout()
,
tar_git_init()
,
tar_git_log()
,
tar_git_ok()
,
tar_git_snapshot()
,
tar_git_status_code()
,
tar_git_status_targets()
,
tar_git_status()
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) gert::git_init() tar_git_init() tar_git_status_data() }) }
if (Sys.getenv("TAR_EXAMPLES") == "true" && tar_git_ok(verbose = FALSE)) { targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) gert::git_init() tar_git_init() tar_git_status_data() }) }
Show which targets are outdated.
tar_git_status_targets( script = targets::tar_config_get("script"), store = targets::tar_config_get("store"), reporter = targets::tar_config_get("reporter_outdated"), envir = parent.frame(), callr_function = callr::r, callr_arguments = NULL )
tar_git_status_targets( script = targets::tar_config_get("script"), store = targets::tar_config_get("store"), reporter = targets::tar_config_get("reporter_outdated"), envir = parent.frame(), callr_function = callr::r, callr_arguments = NULL )
script |
Character of length 1, path to the
target script file. Defaults to |
store |
Character of length 1, path to the
|
reporter |
Character of length 1, name of the reporter to use. Controls how messages are printed as targets are checked. Choices:
|
envir |
An environment, where to run the target R script
(default: The |
callr_function |
A function from |
callr_arguments |
A list of arguments to |
This function has prettier output than targets::tar_outdated()
,
and it mainly serves tar_git_status()
.
A tibble
with the names of outdated targets.
Other git:
tar_git_checkout()
,
tar_git_init()
,
tar_git_log()
,
tar_git_ok()
,
tar_git_snapshot()
,
tar_git_status_code()
,
tar_git_status_data()
,
tar_git_status()
targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) tar_git_status_targets() })
targets::tar_dir({ # Containing code does not modify the user's file space. targets::tar_script(tar_target(data, 1)) targets::tar_make() list.files("_targets", all.files = TRUE) tar_git_status_targets() })