编辑:将旧的解决方案留在底部,因为它获得了赏金和很多选票,但提出了改进的答案
您可以使用{建设性}包 https://github.com/cynkra/constructive :
# install.packages("constructive")
# or dev version:
# remotes::install_github("cynkra/constructive")
Df <- data.frame(A = c(2, 2, 2, 6, 7, 8),
B = c("A", "G", "N", NA, "L", "L"),
C = c(1L, 3L, 5L, NA, NA, NA))
constructive::construct(Df)
#> data.frame(
#> A = c(2, 2, 2, 6, 7, 8),
#> B = c("A", "G", "N", NA, "L", "L"),
#> C = c(1L, 3L, 5L, NA, NA, NA)
#> )
它具有许多常见类的自定义构造函数,因此它应该能够以人类可读的方式忠实地重现大多数对象。
旧的解决方案:
3个解决方案:
-
一个包装纸dput
(处理标准data.frames
, tibbles
and lists
)
-
a read.table
解决方案(对于data.frames
)
-
a tibble::tribble
解决方案(对于data.frames
,返回一个tibble
)
全部包括n
and random
参数允许仅输出数据的头部或动态采样。
dput_small1(Df)
# Df <- data.frame(
# A = c(2, 2, 2, 6, 7, 8),
# B = structure(c(1L, 2L, 4L, NA, 3L, 3L), .Label = c("A", "G", "L",
# "N"), class = "factor"),
# C = c(1L, 3L, 5L, NA, NA, NA) ,
# stringsAsFactors=FALSE)
dput_small2(Df,stringsAsFactors=TRUE)
# Df <- read.table(sep="\t", text="
# A B C
# 2 A 1
# 2 G 3
# 2 N 5
# 6 NA NA
# 7 L NA
# 8 L NA", header=TRUE, stringsAsFactors=TRUE)
dput_small3(Df)
# Df <- tibble::tribble(
# ~A, ~B, ~C,
# 2, "A", 1L,
# 2, "G", 3L,
# 2, "N", 5L,
# 6, NA_character_, NA_integer_,
# 7, "L", NA_integer_,
# 8, "L", NA_integer_
# )
# Df$B <- factor(Df$B)
包装周围dput
该选项给出的输出非常接近问题中提出的输出。它很一般,因为它实际上是环绕的dput
,但单独应用于列。
multiline
means '将 dput 的默认输出保留为多行'.
dput_small1<- function(x,
name=as.character(substitute(x)),
multiline = TRUE,
n=if ('list' %in% class(x)) length(x) else nrow(x),
random=FALSE,
seed = 1){
name
if('tbl_df' %in% class(x)) create_fun <- "tibble::tibble" else
if('list' %in% class(x)) create_fun <- "list" else
if('data.table' %in% class(x)) create_fun <- "data.table::data.table" else
create_fun <- "data.frame"
if(random) {
set.seed(seed)
if(create_fun == "list") x <- x[sample(1:length(x),n)] else
x <- x[sample(1:nrow(x),n),]
} else {
x <- head(x,n)
}
line_sep <- if (multiline) "\n " else ""
cat(sep='',name," <- ",create_fun,"(\n ",
paste0(unlist(
Map(function(item,nm) paste0(nm,if(nm=="") "" else " = ",paste(capture.output(dput(item)),collapse=line_sep)),
x,if(is.null(names(x))) rep("",length(x)) else names(x))),
collapse=",\n "),
if(create_fun == "data.frame") ",\n stringsAsFactors = FALSE)" else "\n)")
}
dput_small1(list(1,2,c=3,d=4),"my_list",random=TRUE,n=3)
# my_list <- list(
# 2,
# d = 4,
# c = 3
# )
read.table
解决方案
For data.frames
然而,我发现以更明确/表格的格式输入很舒服。
这可以通过使用来实现read.table
,然后自动重新格式化列的类型read.table
不会得到正确的。不像第一个解决方案那么通用,但对于 95% 的情况可以顺利工作SO
.
dput_small2 <- function(df,
name=as.character(substitute(df)),
sep='\t',
header=TRUE,
stringsAsFactors = FALSE,
n= nrow(df),
random=FALSE,
seed = 1){
name
if(random) {
set.seed(seed)
df <- df[sample(1:nrow(df),n),]
} else {
df <- head(df,n)
}
cat(sep='',name,' <- read.table(sep="',sub('\t','\\\\t',sep),'", text="\n ',
paste(colnames(df),collapse=sep))
df <- head(df,n)
apply(df,1,function(x) cat(sep='','\n ',paste(x,collapse=sep)))
cat(sep='','", header=',header,', stringsAsFactors=',stringsAsFactors,')')
sapply(names(df), function(x){
if(is.character(df[[x]]) & suppressWarnings(identical(as.character(as.numeric(df[[x]])),df[[x]]))){ # if it's a character column containing numbers
cat(sep='','\n',name,'$',x,' <- as.character(', name,'$',x,')')
} else if(is.factor(df[[x]]) & !stringsAsFactors) { # if it's a factor and conversion is not automated
cat(sep='','\n',name,'$',x,' <- factor(', name,'$',x,')')
} else if(inherits(df[[x]], "POSIXct")){
cat(sep='','\n',name,'$',x,' <- as.POSIXct(', name,'$',x,')')
} else if(inherits(df[[x]], "Date")){
cat(sep='','\n',name,'$',x,' <- as.Date(', name,'$',x,')')
}})
invisible(NULL)
}
最简单的情况
dput_small2(iris,n=6)
将打印:
iris <- read.table(sep="\t", text="
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa", header=TRUE, stringsAsFactors=FALSE)
执行时将返回:
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3.0 1.4 0.2 setosa
# 3 4.7 3.2 1.3 0.2 setosa
# 4 4.6 3.1 1.5 0.2 setosa
# 5 5.0 3.6 1.4 0.2 setosa
# 6 5.4 3.9 1.7 0.4 setosa
str(iris)
# 'data.frame': 6 obs. of 5 variables:
# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4
# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9
# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7
# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4
# $ Species : chr " setosa" " setosa" " setosa" " setosa" ...
更复杂
虚拟数据:
test <- data.frame(a=1:5,
b=as.character(6:10),
c=letters[1:5],
d=factor(letters[6:10]),
e=Sys.time()+(1:5),
stringsAsFactors = FALSE)
This:
dput_small2(test,'df2')
将打印:
df2 <- read.table(sep="\t", text="
a b c d e
1 6 a f 2018-02-15 11:53:17
2 7 b g 2018-02-15 11:53:18
3 8 c h 2018-02-15 11:53:19
4 9 d i 2018-02-15 11:53:20
5 10 e j 2018-02-15 11:53:21", header=TRUE, stringsAsFactors=FALSE)
df2$b <- as.character(df2$b)
df2$d <- factor(df2$d)
df2$e <- as.POSIXct(df2$e)
执行时将返回:
# a b c d e
# 1 1 6 a f 2018-02-15 11:53:17
# 2 2 7 b g 2018-02-15 11:53:18
# 3 3 8 c h 2018-02-15 11:53:19
# 4 4 9 d i 2018-02-15 11:53:20
# 5 5 10 e j 2018-02-15 11:53:21
str(df2)
# 'data.frame': 5 obs. of 5 variables:
# $ a: int 1 2 3 4 5
# $ b: chr "6" "7" "8" "9" ...
# $ c: chr "a" "b" "c" "d" ...
# $ d: Factor w/ 5 levels "f","g","h","i",..: 1 2 3 4 5
# $ e: POSIXct, format: "2018-02-15 11:53:17" "2018-02-15 11:53:18" "2018-02-15 11:53:19" "2018-02-15 11:53:20" ...
all.equal(df2,test)
# [1] "Component “e”: Mean absolute difference: 0.4574251" # only some rounding error
tribble
解决方案
The read.table
选项非常可读,但不是很通用。和tribble
几乎任何数据类型都可以处理(尽管因素需要临时修复)。
该解决方案对于 OP 的示例来说不是很有用,但对于列表列非常有用(请参见下面的示例)。要利用输出,库tibble
是必须的。
就像我的第一个解决方案一样,它是一个包装器dput
,但我不是“dputting”列,而是“dputting”元素。
dput_small3 <- function(df,
name=as.character(substitute(df)),
n= nrow(df),
random=FALSE,
seed = 1){
name
if(random) {
set.seed(seed)
df <- df[sample(1:nrow(df),n),]
} else {
df <- head(df,n)
}
df1 <- lapply(df,function(col) if(is.factor(col)) as.character(col) else col)
dputs <- sapply(df1,function(col){
col_dputs <- sapply(col,function(elt) paste(capture.output(dput(elt)),collapse=""))
max_char <- max(nchar(unlist(col_dputs)))
sapply(col_dputs,function(elt) paste(c(rep(" ",max_char-nchar(elt)),elt),collapse=""))
})
lines <- paste(apply(dputs,1,paste,collapse=", "),collapse=",\n ")
output <- paste0(name," <- tibble::tribble(\n ",
paste0("~",names(df),collapse=", "),
",\n ",lines,"\n)")
cat(output)
sapply(names(df), function(x) if(is.factor(df[[x]])) cat(sep='','\n',name,'$',x,' <- factor(', name,'$',x,')'))
invisible(NULL)
}
dput_small3(dplyr::starwars[c(1:3,11)],"sw",n=6,random=TRUE)
# sw <- tibble::tribble(
# ~name, ~height, ~mass, ~films,
# "Lando Calrissian", 177L, 79, c("Return of the Jedi", "The Empire Strikes Back"),
# "Finis Valorum", 170L, NA_real_, "The Phantom Menace",
# "Ki-Adi-Mundi", 198L, 82, c("Attack of the Clones", "The Phantom Menace", "Revenge of the Sith"),
# "Grievous", 216L, 159, "Revenge of the Sith",
# "Wedge Antilles", 170L, 77, c("Return of the Jedi", "The Empire Strikes Back", "A New Hope"),
# "Wat Tambor", 193L, 48, "Attack of the Clones"
# )