machine learning.Rmd

---
title: "machine learning"
output: 
  bookdown::html_document2:
    code_folding: show
    number_sections: no
    toc: yes
    toc_depth: 6
    toc_float: yes
---

```{r,echo=FALSE,warning=FALSE,echo=FALSE}
library(randomForest)
library(glmnet)
library(Matrix)
library(tidyverse)
library(h2o)
library(corrr)
library(Hmisc)
library(ggpubr)
library(metafor)
library(forestploter)
```


# Predict the GC%

Load the data.

```{r}
pathway <- read.csv('./excel/results_with_env.csv')
data =pathway[,4:ncol(pathway)]
attach(data)
```


```{r}
selected_columns <- c()

# 获取数据表的列数
num_columns <- ncol(data)

# 循环遍历从第五列开始的其他列
for (i in 2:num_columns) {
  # 获取当前列的名称
  col <- colnames(data)[i]
  
  # 计算相关性并进行显著性检验
  result <- cor.test(data$GC, data[, col])
  
  # 检查p值是否小于0.05
  # 检查p值是否小于0.05，并且非零值数量大于一半
  if (result$p.value < 0.05 && sum(data[, col] != 0) > (nrow(data) / 2)) {
    selected_columns <- c(selected_columns, col)
  }
}
selected_columns
```

```{r}
selected_columns <- data.frame(Column = character(), P_Value = numeric(), Confidence_Interval = character())

# 获取数据表的列数
num_columns <- ncol(data)

# 循环遍历从第二列开始的其他列
for (i in 2:num_columns) {
  # 获取当前列的名称
  col <- colnames(data)[i]
  
  # 计算相关性并进行显著性检验
  result <- cor.test(data$GC, data[, col])
  
  # 检查p值是否小于0.05，并且非零值数量大于一半
  if (result$p.value < 0.05 && sum(data[, col] != 0) > (nrow(data) / 2)) {
    # 创建一个包含结果的行
    row <- data.frame(Column = col,
                      P_Value = result$p.value,
                      Confidence_Interval = paste(result$conf.int, collapse = " - "))
    
    # 将行添加到选定列的数据框中
    selected_columns <- rbind(selected_columns, row)
  }
}

selected_columns

```

```{r}
selected_columns$Confidence_Interval
```

```{r}
# 将区间拆分为两个数值
selected_columns$Confidence_Interval <- strsplit(selected_columns$Confidence_Interval, " - ")
selected_columns$Lower_Bound <- sapply(selected_columns$Confidence_Interval, function(x) as.numeric(x[1]))
selected_columns$Upper_Bound <- sapply(selected_columns$Confidence_Interval, function(x) as.numeric(x[2]))

# 移除原始的Confidence_Interval列
selected_columns$Confidence_Interval <- NULL
# 生成森林图
ggplot(selected_columns, aes(x = Column, y = P_Value, ymin = Lower_Bound, ymax = Upper_Bound)) +
  geom_pointrange() +
  coord_flip() +  # 将图形翻转为水平方向
  theme_classic()  # 使用经典主题
```


```{r}
x=KT
model1 <- lm(GC~x)
plot(GC,x)
summary(model1)
```

# Auto-Machine learning

```{r}
# 初始化h2o环境
h2o.init()

# 转换为H2OFrame
h2o_data <- as.h2o(data)
# 分割数据
# 将数据划分为训练集和验证集
splits <- h2o.splitFrame(h2o_data, ratios = 0.8)
train <- splits[[1]]
test  <- splits[[2]]

# 指定预测目标和输入特征
y <- "GC"  # 预测目标
x <- setdiff(names(h2o_data), y)  # 输入特征，去除预测目标的列

# 使用H2O的AutoML函数来训练模型
aml <- h2o.automl(x = x, y = y,
                  training_frame = train,
                  leaderboard_frame = test,
                  max_models = 20,
                  seed = 1)

# 打印出AutoML的排行榜
lb <- aml@leaderboard
print(lb)

# 使用最佳模型进行预测
preds <- h2o.predict(aml@leader, newdata = test)

# 打印出预测结果s
print(preds)
```


```{r}
# Load necessary libraries
library(glmnet)
library(caret)

# Load the data
data <- read.csv("excel/results_with_env.csv")

# Prepare the data
X <- data[, -(1:4)]
y <- data$GC

# Standardize the features
X_scaled <- scale(X)

# Split the data into training set and test set
set.seed(42)
trainIndex <- createDataPartition(y, p = .8, 
                                  list = FALSE, 
                                  times = 1)
X_train <- X_scaled[trainIndex,]
y_train <- y[trainIndex]
X_test <- X_scaled[-trainIndex,]
y_test <- y[-trainIndex]

# Define the model
model <- cv.glmnet(as.matrix(X_train), y_train, alpha = 1)

# Get the lambda that gives minimum mean cross-validated error
lambda_min <- model$lambda.min

# Fit the model using the optimal lambda
final_model <- glmnet(as.matrix(X_train), y_train, alpha = 1, lambda = lambda_min)

# Predict on the test set
predictions <- predict(final_model, newx = as.matrix(X_test))

# Calculate R squared
SSE <- sum((predictions - y_test)^2)
SST <- sum((y_test - mean(y_test))^2)
R_squared <- 1 - SSE / SST
print(R_squared)

# Print the coefficients
print(coef(final_model))

```