🧬 R语言时间序列蛋白质组学聚类分析完整教程
在蛋白质组学研究中,时间序列分析是理解蛋白质动态变化模式的重要方法。本文将详细介绍如何使用R语言的TCseq包进行时间序列蛋白质组学聚类分析。
🎯 分析目标
通过时间序列聚类分析,我们可以:
- 📈 识别蛋白质在不同时间点的表达模式
 
- 🔍 发现具有相似变化趋势的蛋白质群组
 
- 📊 可视化蛋白质动态变化规律
 
- 🧩 为后续功能富集分析提供基础
 
📦 所需R包
1 2 3 4 5 6 7 8 9 10 11 12
   | 
 
 
 
  library(TCseq)         library(readxl)        library(pheatmap)      library(dplyr)         library(ggplot2)       library(gridExtra)     library(patchwork)    
 
  | 
 
⚙️ 参数配置
🔧 基本参数设置
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
   | 
 
 
 
  work_dir <- "~/mice33/work/lab/深圳三院250730" setwd(work_dir)
 
  input_file <- "2. 数据结果/2. 差异统计/1. 蛋白定量统计.xlsx" sheet_name <- "蛋白定量统计"
 
  cluster_num <- 6           random_seed <- 123         algo_method <- 'cm'       
 
  time_points <- list(   T1 = c("A1", "A2", "A3"),     T2 = c("B1", "B2", "B3"),     T3 = c("C1", "C2", "C3"),     T4 = c("D1", "D2", "D3")    )
 
  | 
 
🎨 可视化参数
1 2 3 4 5 6
   |  figure_width <- 15 figure_height <- 10 single_plot_width <- 8 single_plot_height <- 6 language <- "EN"          
 
  | 
 
📊 数据读取与预处理
📁 数据读取函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
   | read_and_process_data <- function(file_path, sheet_name, time_points) {   cat("正在读取数据...\n")         if (!file.exists(file_path)) {     stop("输入文件不存在:", file_path)   }         protein_data <- read_excel(file_path, sheet = sheet_name)         abundance_cols <- grep("Abundances.*Normalized", colnames(protein_data), value = TRUE)   if (length(abundance_cols) == 0) {     abundance_cols <- grep("Abundance", colnames(protein_data), value = TRUE)   }         sample_matrix <- as.matrix(protein_data[, abundance_cols, drop = FALSE])         if ("Gene Symbol" %in% colnames(protein_data)) {     rownames(sample_matrix) <- make.unique(protein_data$`Gene Symbol`)   } else if ("Protein" %in% colnames(protein_data)) {     rownames(sample_matrix) <- make.unique(protein_data$Protein)   } else {     rownames(sample_matrix) <- paste0("Protein_", 1:nrow(sample_matrix))   }         colnames(sample_matrix) <- gsub(".*:", "", colnames(sample_matrix))   colnames(sample_matrix) <- trimws(colnames(sample_matrix))      return(sample_matrix) }
  | 
 
📈 时间点均值计算
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
   | calculate_timepoint_means <- function(sample_matrix, time_points) {   cat("计算时间点均值...\n")      mean_list <- list()   for (i in seq_along(time_points)) {     tp_name <- names(time_points)[i]     tp_samples <- time_points[[i]]               available_samples <- intersect(tp_samples, colnames(sample_matrix))     if (length(available_samples) == 0) {       stop("时间点", tp_name, "的样本在数据中未找到")     }          mean_list[[tp_name]] <- rowMeans(sample_matrix[, available_samples, drop = FALSE], na.rm = TRUE)     cat("时间点", tp_name, "使用", length(available_samples), "个样本\n")   }      mean_matrix <- do.call(cbind, mean_list)   colnames(mean_matrix) <- names(time_points)      return(mean_matrix) }
  | 
 
🧹 数据预处理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
   | preprocess_data <- function(mean_matrix) {   cat("数据预处理...\n")         cat("缺失值数量:", sum(is.na(mean_matrix)), "\n")   cat("零值数量:", sum(mean_matrix == 0, na.rm = TRUE), "\n")         complete_rows <- complete.cases(mean_matrix)   mean_matrix_clean <- mean_matrix[complete_rows, ]         mean_matrix_log <- log2(mean_matrix_clean + 1)      return(list(clean = mean_matrix_clean, log = mean_matrix_log)) }
  | 
 
🎯 聚类分析
📊 TCseq聚类分析
1 2 3 4 5 6 7 8 9 10 11 12 13
   | perform_clustering <- function(data_matrix, k = 6, algorithm = 'cm', seed = 123) {   cat("开始聚类分析...\n")      set.seed(seed)         tcseq_result <- timeclust(data_matrix,                             algo = algorithm,                                k = k,                                           standardize = TRUE)        return(tcseq_result) }
  | 
 
🔍 聚类算法说明
| 算法类型 | 
代码 | 
特点 | 
适用场景 | 
| 模糊C均值 | 
'cm' | 
允许软聚类,蛋白质可属于多个类别 | 
推荐,适合生物数据 | 
| K均值 | 
'km' | 
硬聚类,每个蛋白质只属于一个类别 | 
快速,适合大数据 | 
| 层次聚类 | 
'hc' | 
基于距离的聚类方法 | 
适合探索性分析 | 
📈 结果可视化
🎨 时间序列图
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
   | create_timeseries_plots <- function(tcseq_result, language = "EN") {   cat("创建时间序列图...\n")         p <- timeclustplot(tcseq_result,                       value = 'z-score',                      cols = 3,                      axis.line.size = 0.8,                      title.size = 12)         cluster_counts <- table(tcseq_result@cluster)   for(i in 1:length(p)) {     p[[i]] <- p[[i]] +        labs(title = paste("Cluster", i, "(n =", cluster_counts[i], ")"),            x = "Time Points",            y = "Z-score Normalized Expression") +       theme_minimal()   }      return(p) }
  | 
 
🔥 聚类中心热图
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
   | create_heatmap <- function(tcseq_result, language = "EN") {   cluster_counts <- table(tcseq_result@cluster)   heatmap_data <- tcseq_result@centers         rownames(heatmap_data) <- paste0("Cluster ", 1:nrow(heatmap_data),                                     "\n(n=", cluster_counts, ")")   colnames(heatmap_data) <- paste("Time Point", 1:ncol(heatmap_data))         heatmap_plot <- pheatmap(heatmap_data,                            cluster_rows = FALSE,                            cluster_cols = FALSE,                           display_numbers = TRUE,                           number_format = "%.2f",                           main = "Time-series Expression Patterns (Z-score)",                           color = colorRampPalette(c("blue", "white", "red"))(50),                           cellwidth = 35,                           cellheight = 35,                           silent = TRUE)      return(list(plot = heatmap_plot, data = heatmap_data)) }
  | 
 
📊 综合折线图
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
   | create_comprehensive_plot <- function(tcseq_result, language = "EN") {   cluster_counts <- table(tcseq_result@cluster)         cluster_centers_df <- data.frame(     Cluster = factor(rep(1:nrow(tcseq_result@centers), each = ncol(tcseq_result@centers))),     TimePoint = rep(1:ncol(tcseq_result@centers), nrow(tcseq_result@centers)),     Expression = as.vector(t(tcseq_result@centers)),     Count = rep(cluster_counts, each = ncol(tcseq_result@centers))   )      cluster_centers_df$Cluster_Label <- paste0("Cluster ", cluster_centers_df$Cluster,                                              " (n=", cluster_centers_df$Count, ")")         comprehensive_plot <- ggplot(cluster_centers_df,                                aes(x = TimePoint, y = Expression,                                    color = Cluster_Label,                                    group = Cluster_Label)) +     geom_line(size = 1.2, alpha = 0.8) +     geom_point(size = 3, alpha = 0.9) +     scale_x_continuous(breaks = 1:max(cluster_centers_df$TimePoint),                        labels = paste("Time Point", 1:max(cluster_centers_df$TimePoint))) +     labs(title = "Time-series Expression Patterns of All Clusters",          subtitle = "Based on Fuzzy C-means Clustering",          x = "Time Points",          y = "Z-score Normalized Expression",          color = "Clusters") +     theme_minimal() +     theme(plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),           plot.subtitle = element_text(size = 12, hjust = 0.5)) +     scale_color_brewer(type = "qual", palette = "Set2")      return(comprehensive_plot) }
  | 
 
📁 结果整理与保存
📊 结果数据整理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
   | organize_results <- function(tcseq_result, data_clean, data_log, language = "EN") {   protein_names <- rownames(data_log)   cluster_assignments <- tcseq_result@cluster   cluster_counts <- table(cluster_assignments)         cluster_results <- data.frame(     Protein = protein_names,     Cluster = cluster_assignments,     stringsAsFactors = FALSE   )         for (i in 1:ncol(data_log)) {     cluster_results[paste0("Timepoint", i, "_zscore")] <- data_log[, i]   }         for (i in 1:ncol(data_clean)) {     cluster_results[paste0("Timepoint", i, "_original")] <- data_clean[, i]   }         cluster_summary <- data.frame(     Cluster = 1:nrow(tcseq_result@centers),     Count = as.numeric(cluster_counts)   )         for (i in 1:ncol(tcseq_result@centers)) {     cluster_summary[paste0("T", i, "_Center")] <- round(tcseq_result@centers[, i], 3)   }      return(list(results = cluster_results, summary = cluster_summary)) }
  | 
 
💾 批量保存结果
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
   | save_all_results <- function(tcseq_result, organized_results, plots, heatmap_info,                              comprehensive_plot, integrated_plot, results_dir) {         if (!dir.exists(results_dir)) {     dir.create(results_dir)   }         write.csv(organized_results$results,              file.path(results_dir, "Complete_Clustering_Results.csv"),              row.names = FALSE)   write.csv(organized_results$summary,              file.path(results_dir, "Cluster_Pattern_Summary.csv"),              row.names = FALSE)         cluster_counts <- table(tcseq_result@cluster)   for(i in 1:length(cluster_counts)) {     cluster_proteins <- organized_results$results[organized_results$results$Cluster == i, ]     write.csv(cluster_proteins,                file.path(results_dir, paste0("Cluster_", i, "_proteins.csv")),                row.names = FALSE)   }         ggsave(file.path(results_dir, "Comprehensive_Line_Plot.pdf"),           plot = comprehensive_plot, width = 12, height = 8)         pdf(file.path(results_dir, "Cluster_Centers_Heatmap.pdf"), width = 10, height = 8)   print(heatmap_info$plot)   dev.off() }
   | 
 
🚀 完整执行流程
🔄 主程序函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
   | main_analysis <- function() {   cat("开始时间序列蛋白质组学聚类分析...\n")         sample_matrix <- read_and_process_data(input_file, sheet_name, time_points)   mean_matrix <- calculate_timepoint_means(sample_matrix, time_points)   processed_data <- preprocess_data(mean_matrix)         tcseq_result <- perform_clustering(processed_data$log, cluster_num, algo_method, random_seed)         plots <- create_timeseries_plots(tcseq_result, language)   heatmap_info <- create_heatmap(tcseq_result, language)   comprehensive_plot <- create_comprehensive_plot(tcseq_result, language)         organized_results <- organize_results(tcseq_result, processed_data$clean, processed_data$log)         save_all_results(tcseq_result, organized_results, plots, heatmap_info,                     comprehensive_plot, integrated_plot, results_dir)         print_summary(organized_results, tcseq_result)      return(list(     tcseq_result = tcseq_result,     organized_results = organized_results,     plots = plots,     comprehensive_plot = comprehensive_plot   )) }
 
  results <- main_analysis()
  | 
 
📊 结果解读
🔍 聚类模式识别
分析完成后,通常会得到以下几种典型的时间序列模式:
- 📈 上升型:蛋白质表达量随时间逐渐增加
 
- 📉 下降型:蛋白质表达量随时间逐渐减少
 
- 🔺 早期峰型:在早期时间点达到最高表达
 
- 🔻 晚期峰型:在晚期时间点达到最高表达
 
- 📊 稳定型:表达量在时间序列中保持相对稳定
 
- 🌊 波动型:表达量呈现周期性变化
 
📈 生物学意义
不同的聚类模式可能反映:
- 早期应答基因:快速响应刺激的蛋白质
 
- 晚期应答基因:延迟响应的调节蛋白质
 
- 持续表达基因:维持细胞基本功能的蛋白质
 
- 周期性调节基因:参与细胞周期调控的蛋白质
 
🎯 使用建议
✨ 参数优化
聚类数量选择:
1 2 3 4 5
   |  for(k in 4:8) {   result_k <- timeclust(data_matrix, k = k)    }
 
  | 
 
 
算法选择:
1 2 3 4 5
   |  algorithms <- c('cm', 'km', 'hc') results_compare <- lapply(algorithms, function(alg) {   timeclust(data_matrix, algo = alg, k = 6) })
 
  | 
 
📋 质量控制
- 检查输入数据的质量和完整性
 
- 确保样本分组信息正确
 
- 验证聚类结果的生物学合理性
 
- 进行功能富集分析验证聚类结果
 
🔗 扩展分析
🧬 功能富集分析
1 2 3 4 5 6 7 8 9 10 11 12 13
   |  library(clusterProfiler)
  for(i in 1:cluster_num) {   cluster_genes <- organized_results$results[     organized_results$results$Cluster == i, "Protein"   ]         go_result <- enrichGO(gene = cluster_genes,                         OrgDb = org.Hs.eg.db,                         ont = "BP") }
 
  | 
 
📊 共表达网络分析
1 2 3 4 5 6 7 8
   |  library(WGCNA)
 
  cor_matrix <- cor(t(processed_data$log))
 
 
 
 
  | 
 
📚 相关资源
- TCseq包文档:Bioconductor TCseq
 
- 时间序列分析理论:《Time Series Analysis and Its Applications》
 
- 蛋白质组学数据分析:《Computational Proteomics》