# Exploratory look at how age, height (cm) and weight (kg) relate.
# NOTE(review): assumes kg.txt is tab-separated with a header row and contains
# the columns `age`, `cm` and `kg` as its first three columns -- confirm
# against the data file.

# Loaded here because the pipe, as_tibble() and ggplot() are all used below;
# without it a fresh session fails on the first piped statement.
library(tidyverse)

data <- read.table("e://kg.txt", header = TRUE, sep = "\t") %>%
  as_tibble()

# The columns are deliberately NOT attach()-ed: every statement below names
# its data source explicitly, so nothing depends on the search path.

# Pairwise line plots of the three variables.
data %>% ggplot(aes(cm, kg)) + geom_line()
data %>% ggplot(aes(age, cm)) + geom_line()
data %>% ggplot(aes(age, kg)) + geom_line()

# Relationship among age, height and weight: correlation matrix of the first
# three columns, drawn as a coloured grid with the coefficients overlaid.
data[1:3] %>%
  cor() %>%
  corrplot::corrplot(method = "color", addCoef.col = "grey")

# Linear model of weight against the cube of height. `data = data` (rather
# than a piped `.`) keeps the recorded model call readable in the output.
lm_data <- lm(kg ~ I(cm^3), data = data)
summary(lm_data)
lm_data

# Base-graphics view of the same fit. with() resolves `cm` and `kg` inside
# `data` while keeping the default axis titles ("cm^3", "kg").
with(data, {
  plot(cm^3, kg, xaxt = "n")
  # Ticks sit at the cm^3 positions but are labelled with the raw cm values.
  axis(1, at = cm^3, labels = cm)
  abline(lm_data)
})

# Fit with ggplot: points plus geom_smooth()'s default smoother.
data %>%
  ggplot(aes(cm^3, kg)) +
  geom_point() +
  geom_smooth()
# https://zhuanlan.zhihu.com/p/94372177
# https://www.jianshu.com/p/a081a791ae03
# https://cloud.tencent.com/developer/article/1674211
# https://www3.nd.edu/~steve/computing_with_data/2_Motivation/motivate_ht_wt.html?spm=a2c4e.11153940.blogcont603256.20.333b1d6fYOsiOK
# Load the data. The data set can be downloaded from:
# https://github.com/johnmyleswhite/ML_for_Hackers/blob/master/02-Exploration/data/01_heights_weights_genders.csv
# NOTE(review): the link points at a CSV, while this script reads a
# tab-separated .txt copy -- presumably converted by hand; confirm.
library(tidyverse)
ht_weight_df <- as_tibble(
  read.table(
    file = "e://01_heights_weights_genders.txt",
    sep = "\t",
    header = TRUE
  )
)

# Show the missing-data pattern of the freshly loaded table.
mice::md.pattern(ht_weight_df)
# Plot to inspect correlations: correlation matrix of every column except the
# first, drawn as a coloured grid with the coefficients overlaid.
corrplot::corrplot(
  cor(select(ht_weight_df, -1)),
  method = "color",
  addCoef.col = "grey"
)

# Scatter plot of a random 10% sample of the same columns, with tiny markers.
plot(sample_frac(select(ht_weight_df, -1), 0.1), cex = 0.1)

# Fit a straight line to check the linear relationship, then draw the fitted
# line on top of the scatter plot above.
lm_ht_weight <- lm(Weight ~ Height, data = ht_weight_df)
summary(lm_ht_weight)
abline(lm_ht_weight)
# Compare by gender: mean Height per Gender, multiplied by 2.54 and rounded.
# NOTE(review): 2.54 is presumably an inches-to-centimetres conversion --
# confirm the unit of Height.
dplyr::summarise(
  group_by(ht_weight_df, Gender),
  round(mean(Height) * 2.54)
)
# Author's notes on alternatives:
# subset(Gender == ...) can also be used to pick out one group
# fivenum() does not work with [2] or select(2)
# sapply() does not work with $variable or select(2)
# psych::describe() does not work with [2]
# pastecs::stat.desc(), Hmisc::describe() and summary() all work
# plyr::ddply(.(Gender), function(df) summary(df$Height)) computes per-group
# values straight from the original data
# Inspect the distributions: overlay the Height densities of both genders
# using base graphics. Wrapped in local() so the helper and the intermediate
# vectors do not leak into the global environment.
local({
  # Heights, as a plain numeric vector, of the rows whose Gender equals `g`.
  heights_of <- function(g) {
    as.numeric(ht_weight_df$Height[which(ht_weight_df$Gender == g)])
  }
  male_heights <- heights_of("Male")
  female_heights <- heights_of("Female")

  par(mfrow = c(1, 1))

  # Male density as vertical bars; ann = FALSE suppresses the default
  # annotations, so the title is added separately below.
  plot(density(male_heights), type = "h", col = 4, ann = FALSE)
  # Female density as a line on the same axes.
  lines(density(female_heights), col = 2)
  title(main = "Height By Gender")

  # Dotted vertical lines at the mean height of each gender (male, female).
  abline(
    v = c(mean(male_heights), mean(female_heights)),
    col = c(1, 2),
    lty = 3
  )
})
# Density of Height, one coloured curve per Gender.
ggplot(data = ht_weight_df, mapping = aes(x = Height, colour = Gender)) +
  geom_density()

# QQ plot of Height in one panel per Gender; the qq stat requires the
# `sample` aesthetic.
ggplot(data = ht_weight_df, mapping = aes(sample = Height)) +
  facet_wrap(~Gender) +
  geom_point(stat = "qq")
# Linear fit on grouped data: translucent scatter of Weight against Height
# with a separate straight-line fit per Gender.
ggplot(
  data = ht_weight_df,
  mapping = aes(x = Height, y = Weight, colour = Gender)
) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", formula = y ~ x)

# The same idea as a single model: Height, Gender and their interaction.
lm_ht_wt_by_gender <- lm(Weight ~ Height * Gender, data = ht_weight_df)
summary(lm_ht_wt_by_gender)
# 如果观察人的一生,身高、体重的变化曲线,会是什么样的呢?