Load in datasets for the topic model and merge them into one Seurat object

Download datasets from GEO

Some of the data included in this model come from publicly available datasets from published studies, housed in GEO. Run the following code from the home directory of this repo to download the necessary datasets.

# Create the download directory. -p also creates missing parents and does not
# fail when the directory already exists, so the snippet can be re-run.
mkdir -p data/external_scRNA
cd data/external_scRNA

#download Ji et al 2018 data of Chondrocytes from OA patients run through STRT single cell RNA protocol from GEO (Series GSE104782)
mkdir -p Jietal2018
cd Jietal2018
# NOTE(review): the download URLs were missing from this document (a bare
# `wget` aborts with "missing URL"). They are reconstructed here from GEO's
# standard FTP layout and the file names read by the R code further down --
# confirm them against the GSE104782 record before relying on them.
wget "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE104nnn/GSE104782/suppl/GSE104782_allcells_UMI_count.txt.gz" #raw count data
wget "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE104nnn/GSE104782/suppl/GSE104782_Table_Cell_quality_information_and_clustering_information.xlsx" # clustering information for cells in the raw data

#download Wu et al 2021 data of iPSC-Chondrocyte chondrogenic pellet differentiation time course from GEO (Series GSE160625). Data include barcodes, genes, and count matrices from samples taken from iPSCs and samples from 7, 14, 28, and 42 day chondrogenic pellets treated with chondrogenic media containing a WNT and MITF inhibitor
cd ..
mkdir -p Wuetal2021
cd Wuetal2021
# NOTE(review): the download URLs were missing from this document (a bare
# `wget` aborts with "missing URL"). The pellet URLs below are reconstructed
# from GEO's standard FTP layout and the file names read by the R code further
# down -- confirm them against the GSE160625 record before relying on them.

# TODO: iPSC barcodes, iPSC genes and iPSC count matrix -- neither the URLs nor
# the file names of the iPSC sample appear anywhere in this document. Look up
# the iPSC sample of GSE160625 and add its three supplementary files here.

wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876136/suppl/GSM4876136_C59_D7_barcodes.tsv.gz" # day 7 pellet barcodes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876136/suppl/GSM4876136_C59_D7_genes.tsv.gz" # day 7 pellet genes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876136/suppl/GSM4876136_C59_D7_matrix.mtx.gz" # day 7 pellet count matrix
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876137/suppl/GSM4876137_C59_D14_barcodes.tsv.gz" # day 14 pellet barcodes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876137/suppl/GSM4876137_C59_D14_genes.tsv.gz" # day 14 pellet genes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876137/suppl/GSM4876137_C59_D14_matrix.mtx.gz" # day 14 pellet count matrix
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876138/suppl/GSM4876138_C59_D28_barcodes.tsv.gz" # day 28 pellet barcodes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876138/suppl/GSM4876138_C59_D28_genes.tsv.gz" # day 28 pellet genes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876138/suppl/GSM4876138_C59_D28_matrix.mtx.gz" # day 28 pellet count matrix
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876139/suppl/GSM4876139_C59_D42_barcodes.tsv.gz" # day 42 pellet barcodes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876139/suppl/GSM4876139_C59_D42_genes.tsv.gz" # day 42 pellet genes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4876nnn/GSM4876139/suppl/GSM4876139_C59_D42_matrix.mtx.gz" # day 42 pellet count matrix

#download data for Chou et al 2020 from GEO (Sample GSM4626766) for chondrocytes from Minimally damaged outer lateral tibial (oLT) plateau cartilage isolated from one male individual
cd ..
mkdir -p Chou_et_al2020
cd Chou_et_al2020
# NOTE(review): the download URLs were missing from this document (a bare
# `wget` aborts with "missing URL"). They are reconstructed here from GEO's
# standard FTP layout and the file names read by the R code further down --
# confirm them against the GSM4626766 record before relying on them.
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4626nnn/GSM4626766/suppl/GSM4626766_OA_oLT_113.barcodes.tsv.gz" # barcodes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4626nnn/GSM4626766/suppl/GSM4626766_OA_oLT_113.genes.tsv.gz" # genes
wget "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4626nnn/GSM4626766/suppl/GSM4626766_OA_oLT_113.matrix.mtx.gz" # count matrix

Load in datasets

Load in the datasets and create Seurat objects for each of them. Add metadata about the cell type they were assigned in their original contexts. Merge the Seurat objects into one large object.

Registered S3 method overwritten by 'cli':
  method     from    
  print.boxx spatstat
── Attaching packages ────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3     ✓ purrr   0.3.4
✓ tibble  3.0.4     ✓ dplyr   1.0.2
✓ tidyr   1.1.2     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.4.0
── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
x tidyr::expand() masks Matrix::expand()
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
x tidyr::pack()   masks Matrix::pack()
x tidyr::unpack() masks Matrix::unpack()

# iPSC-Chondrocyte data from the current study
ANT1.2 <- readRDS("data/ANT1_2.rds")
# Tag every cell with its cell type. The target metadata column has to be
# passed through `col.name`; a bare `= "Cell.Type"` does not parse.
ANT1.2 <- Seurat::AddMetaData(ANT1.2, "iPSC-Chondrocyte", col.name = "Cell.Type")
[1] 33538  1815
# Chondrocytes from OA patients (Ji et al 2018, GEO series GSE104782), run
# through the STRT protocol
# load raw count data
ji_chondro <- read.table("data/external_scRNA/Jietal2018/GSE104782_allcells_UMI_count.txt.gz", header = TRUE)
# The first column holds the gene names; every remaining column is one cell.
ji_chondro_genes <- ji_chondro[, 1]
# drop = FALSE keeps a data frame even if only a single cell column remains
ji_chondro_data <- ji_chondro[, -1, drop = FALSE]
ji_chondro <- ji_chondro_data
rownames(ji_chondro) <- ji_chondro_genes
# load clustering assignment data from paper
ji_chondro_metadata <- read_excel("data/external_scRNA/Jietal2018/GSE104782_Table_Cell_quality_information_and_clustering_information.xlsx")
# Subset count data to include only those cells that were assigned to a cluster
# labeled as being a chondrocyte in the original paper, i.e. cells whose
# `Cluster` entry is not missing.
# NOTE(review): the filter expression was garbled in this document and is
# reconstructed here. It assumes the metadata rows are in the same order as the
# columns of the count table -- confirm.
ji_chondro.filtered <- ji_chondro[, !is.na(ji_chondro_metadata$Cluster)]
cluster.ident <- "ji_chondrocyte"
# Create seurat object from chondrocytes data
ji_chondro.seurat <- CreateSeuratObject(ji_chondro.filtered, project = "Chondrocytes")
Warning: Feature names cannot have underscores ('_'), replacing with dashes
# Record the cell type label; the column name must be given as `col.name`.
ji_chondro.seurat <- Seurat::AddMetaData(ji_chondro.seurat, cluster.ident, col.name = "Cell.Type")
[1] 24153  1464
# iPSC chondrogenic cells from a directed differentiation time course in iPSC
# chondrogenic pellet culture (Wu et al 2021, GEO series GSE160625).
# Day 7 pellet: sparse count matrix and gene table.
count_data_wu_d7 <- readMM("data/external_scRNA/Wuetal2021/GSM4876136_C59_D7_matrix.mtx.gz")
# The file has no header row; FALSE is spelled out because `F` can be reassigned.
genes_wu_d7 <- read_tsv("data/external_scRNA/Wuetal2021/GSM4876136_C59_D7_genes.tsv.gz", col_names = FALSE)
Parsed with column specification:
  X1 = col_character(),
  X2 = col_character()
# Day 7 pellet cell barcodes (one per line, no header).
# NOTE(review): the reader call was stripped from this line in the document;
# `as.data.frame(read_tsv(...))` is reconstructed from the trailing `))` and the
# readr column-specification message printed for it -- confirm.
barcodes_wu_d7 <- as.data.frame(read_tsv("data/external_scRNA/Wuetal2021/GSM4876136_C59_D7_barcodes.tsv.gz", col_names = FALSE))
Parsed with column specification:
  X1 = col_character()
# Label the day 7 matrix: rows take the second column of the gene table,
# columns take the cell barcodes. Then wrap it in a Seurat object.
dimnames(count_data_wu_d7) <- list(genes_wu_d7$X2, barcodes_wu_d7$X1)
wu_d7_seurat <- CreateSeuratObject(
  counts = count_data_wu_d7,
  project = "d7"
)
Warning: Non-unique features (rownames) present in the input matrix, making

Warning: Feature names cannot have underscores ('_'), replacing with dashes
# Record the cell type label; the column name must be given as `col.name`.
wu_d7_seurat <- Seurat::AddMetaData(wu_d7_seurat, "Wu_chondrogenic_pellet_d7", col.name = "Cell.Type")

# Day 14 pellet: sparse count matrix and gene table.
count_data_wu_d14 <- readMM("data/external_scRNA/Wuetal2021/GSM4876137_C59_D14_matrix.mtx.gz")
# The file has no header row; FALSE is spelled out because `F` can be reassigned.
genes_wu_d14 <- read_tsv("data/external_scRNA/Wuetal2021/GSM4876137_C59_D14_genes.tsv.gz", col_names = FALSE)
Parsed with column specification:
  X1 = col_character(),
  X2 = col_character()
# Day 14 pellet cell barcodes (one per line, no header).
# NOTE(review): the reader call was stripped from this line in the document;
# `as.data.frame(read_tsv(...))` is reconstructed from the trailing `))` and the
# readr column-specification message printed for it -- confirm.
barcodes_wu_d14 <- as.data.frame(read_tsv("data/external_scRNA/Wuetal2021/GSM4876137_C59_D14_barcodes.tsv.gz", col_names = FALSE))
Parsed with column specification:
  X1 = col_character()
# Label the day 14 matrix: rows take the second column of the gene table,
# columns take the cell barcodes. Then wrap it in a Seurat object.
dimnames(count_data_wu_d14) <- list(genes_wu_d14$X2, barcodes_wu_d14$X1)
wu_d14_seurat <- CreateSeuratObject(
  counts = count_data_wu_d14,
  project = "d14"
)
Warning: Non-unique features (rownames) present in the input matrix, making

Warning: Feature names cannot have underscores ('_'), replacing with dashes
# Record the cell type label; the column name must be given as `col.name`.
wu_d14_seurat <- Seurat::AddMetaData(wu_d14_seurat, "Wu_chondrogenic_pellet_d14", col.name = "Cell.Type")

# Day 28 pellet: sparse count matrix and gene table.
count_data_wu_d28 <- readMM("data/external_scRNA/Wuetal2021/GSM4876138_C59_D28_matrix.mtx.gz")
# The file has no header row; FALSE is spelled out because `F` can be reassigned.
genes_wu_d28 <- read_tsv("data/external_scRNA/Wuetal2021/GSM4876138_C59_D28_genes.tsv.gz", col_names = FALSE)
Parsed with column specification:
  X1 = col_character(),
  X2 = col_character()
# Day 28 pellet cell barcodes (one per line, no header).
# NOTE(review): the reader call was stripped from this line in the document;
# `as.data.frame(read_tsv(...))` is reconstructed from the trailing `))` and the
# readr column-specification message printed for it -- confirm.
barcodes_wu_d28 <- as.data.frame(read_tsv("data/external_scRNA/Wuetal2021/GSM4876138_C59_D28_barcodes.tsv.gz", col_names = FALSE))
Parsed with column specification:
  X1 = col_character()
# Label the day 28 matrix: rows take the second column of the gene table,
# columns take the cell barcodes. Then wrap it in a Seurat object.
dimnames(count_data_wu_d28) <- list(genes_wu_d28$X2, barcodes_wu_d28$X1)
wu_d28_seurat <- CreateSeuratObject(
  counts = count_data_wu_d28,
  project = "d28"
)
Warning: Non-unique features (rownames) present in the input matrix, making

Warning: Feature names cannot have underscores ('_'), replacing with dashes
# Record the cell type label; the column name must be given as `col.name`.
wu_d28_seurat <- Seurat::AddMetaData(wu_d28_seurat, "Wu_chondrogenic_pellet_d28", col.name = "Cell.Type")

# Day 42 pellet: sparse count matrix and gene table.
count_data_wu_d42 <- readMM("data/external_scRNA/Wuetal2021/GSM4876139_C59_D42_matrix.mtx.gz")
# The file has no header row; FALSE is spelled out because `F` can be reassigned.
genes_wu_d42 <- read_tsv("data/external_scRNA/Wuetal2021/GSM4876139_C59_D42_genes.tsv.gz", col_names = FALSE)
Parsed with column specification:
  X1 = col_character(),
  X2 = col_character()
# Day 42 pellet cell barcodes (one per line, no header).
# NOTE(review): the reader call was stripped from this line in the document;
# `as.data.frame(read_tsv(...))` is reconstructed from the trailing `))` and the
# readr column-specification message printed for it -- confirm.
barcodes_wu_d42 <- as.data.frame(read_tsv("data/external_scRNA/Wuetal2021/GSM4876139_C59_D42_barcodes.tsv.gz", col_names = FALSE))
Parsed with column specification:
  X1 = col_character()
# Label the day 42 matrix: rows take the second column of the gene table,
# columns take the cell barcodes. Then wrap it in a Seurat object.
dimnames(count_data_wu_d42) <- list(genes_wu_d42$X2, barcodes_wu_d42$X1)
wu_d42_seurat <- CreateSeuratObject(
  counts = count_data_wu_d42,
  project = "d42"
)
Warning: Non-unique features (rownames) present in the input matrix, making

Warning: Feature names cannot have underscores ('_'), replacing with dashes
# Record the cell type label; the column name must be given as `col.name`.
wu_d42_seurat <- Seurat::AddMetaData(wu_d42_seurat, "Wu_chondrogenic_pellet_d42", col.name = "Cell.Type")

# Pool the four pellet time points (day 7, 14, 28 and 42) into one object.
wu_combined <- merge(
  x = wu_d7_seurat,
  y = c(wu_d14_seurat, wu_d28_seurat, wu_d42_seurat),
  project = "Combined.common"
)
Warning in CheckDuplicateCellNames(object.list = objects): Some cell names
are duplicated across objects provided. Renaming to enforce unique cell
# Mitochondrial QC: store the percentage of counts coming from features whose
# name starts with "MT-", then keep only cells below 25%.
# NOTE(review): the metadata column name was stripped from both lines in this
# document (`[[""]]` and `subset = < 25` cannot run); `percent.mt` is
# reconstructed -- confirm.
wu_combined[["percent.mt"]] <- PercentageFeatureSet(wu_combined, pattern = "^MT-")
wu_combined <- subset(wu_combined, subset = percent.mt < 25)
[1] 45924  9434
# Chondrocytes from healthy oLT (outer lateral tibia) of a male human with OA
# (Chou et al 2020, GEO sample GSM4626766): sparse count matrix and gene table.
chou_chondro_counts <- readMM("data/external_scRNA/Chou_et_al2020/GSM4626766_OA_oLT_113.matrix.mtx.gz")
# The file has no header row; FALSE is spelled out because `F` can be reassigned.
chou_chondro_genes <- read_tsv("data/external_scRNA/Chou_et_al2020/GSM4626766_OA_oLT_113.genes.tsv.gz", col_names = FALSE)
Parsed with column specification:
  X1 = col_character(),
  X2 = col_character()
# Cell barcodes for the Chou et al sample (one per line, no header).
# NOTE(review): the reader call was stripped from this line in the document;
# `as.data.frame(read_tsv(...))` is reconstructed from the trailing `))` and the
# readr column-specification message printed for it -- confirm.
chou_chondro_barcodes <- as.data.frame(read_tsv("data/external_scRNA/Chou_et_al2020/GSM4626766_OA_oLT_113.barcodes.tsv.gz", col_names = FALSE))
Parsed with column specification:
  X1 = col_character()
# make Seurat object
# Rows take the second column of the gene table, columns the cell barcodes.
rownames(chou_chondro_counts) <- chou_chondro_genes$X2
colnames(chou_chondro_counts) <- chou_chondro_barcodes$X1

# Build the object and tag every cell with its cell type in one pipeline. The
# metadata column name must be given as `col.name`; a bare `= "Cell.Type"`
# does not parse.
chou_chondro_seurat <- CreateSeuratObject(counts = chou_chondro_counts, project = "Chondrocytes") %>%
  AddMetaData("chou_chondrocyte", col.name = "Cell.Type")
Warning: Non-unique features (rownames) present in the input matrix, making

Warning: Feature names cannot have underscores ('_'), replacing with dashes
[1] 32738  6200
# iPSCs/MSCs/iPSC-chondrocytes/iPSC-osteoblasts from Genevieve Housman. Combine
# all the datasets located in the data directory. Each of these datasets
# contains single cell data from cells from the same human individual from which
# iPSCs were created. 10x single cell RNA sequencing data from iPSCs, MSCs,
# iPSC-Osteoblasts, and iPSC-chondrocytes are included after filtering for
# barcodes with fewer than 25% reads coming from mitochondria.
GH_iPSC <- readRDS("data/external_scRNA/GH_cells/gh_ipsc.rds")
GH_iPSCMSC <- readRDS("data/external_scRNA/GH_cells/gh_msc.rds")
GH_iPSCChond <- readRDS("data/external_scRNA/GH_cells/gh_ipscChond.rds")
GH_iPSCOsteo <- readRDS("data/external_scRNA/GH_cells/gh_ipscOsteo.rds")
# merge all the datasets into one seurat object; add.cell.ids prefixes each
# cell name with its dataset of origin
GH_merged <- merge(GH_iPSC, y = c(GH_iPSCMSC, GH_iPSCChond, GH_iPSCOsteo),
                add.cell.ids = c("ipsc", "msc", "chondrogh", "osteo"), project = "ghousman")
# Overwrite orig.ident with a single study label for all merged cells. The
# column name must be given as `col.name`; a bare `= "orig.ident"` does not parse.
GH_merged <- Seurat::AddMetaData(GH_merged, "Ghousman", col.name = "orig.ident")
[1] 19377 10815
# Human liver samples run through 10x. This is a subset of the entire dataset,
# which is from whole liver homogenate. Following the example from the original
# paper, scViz was used to subset the data for clusters that were called
# hepatocytes in the original study (subset contains a total of 3490 cells).
# NOTE(review): the citation/link for the original study was lost from this
# comment in the document -- restore it.
liver_data <- readRDS("data/external_scRNA/humanLiverSubset_hepatocytes.rds")
# Record the cell type label; the column name must be given as `col.name`.
liver_data <- Seurat::AddMetaData(liver_data, "Hepatocyte", col.name = "Cell.Type")
[1] 20007  3490

Merge and Save the combined data

# Merge Datasets

# Fold every per-study Seurat object into a single object. Cell-level metadata
# (study of origin, assigned cell type) travels with the cells.
Merged_external_data <- merge(
  x = ANT1.2,
  y = c(ji_chondro.seurat, chou_chondro_seurat, GH_merged, liver_data),
  project = "Merged_external"
)
# The Wu et al time course is added in a second step.
Merged_external_data <- merge(
  x = Merged_external_data,
  y = wu_combined,
  project = "Merged_external"
)
[1] 61987 33218
# Persist the merged object for downstream analysis.
saveRDS(
  object = Merged_external_data,
  file = "data/external_scRNA/merged_external_scRNA.rds"
)

