Abstract

Background: When necessary, RNA-sequencing data or polymerase chain reaction (PCR) assays can determine the presence of chromosome Y (ChrY) in samples so that biological variation due to sexual dimorphism can be studied. A prime example is when researchers conduct RNA-sequencing of single embryos, or conceptuses prior to the development of gonads. A recent publication of a complete sequence of ChrY has removed limitations for the development of these procedures in cattle, otherwise imposed by the absence of a ChrY in the reference genome. Using the sequence of the cattle ChrY and transcriptome data, we conducted a systematic search for genes in the ChrY that are exclusively expressed in male tissues. Results: The genes ENSBIXG00000029763, ENSBIXG00000029774, ENSBIXG00000029788, and ENSBIXG00000029892 were consistently expressed across male tissues and lowly expressed or absent in female samples. We observed that the cumulative values of counts per million were 2688-fold greater in males than the equivalent values in female samples. Thus, we deemed these genes suitable for the sexing of samples using RNA-sequencing data. We successfully used this set of genes to infer the sex of 22 cattle blastocysts (eight females and fourteen males). Additionally, the completed sequence of the cattle ChrY has segments in the male-specific region that are not repeated, and we designed a pair of oligonucleotides that targets one of the non-repeated regions in the male-specific sequence of the ChrY. Using this pair of oligonucleotides, in a multiplexed PCR assay with oligonucleotides that anneal to an autosome chromosome, we accurately identified the sex of cattle blastocysts. Conclusions: We developed efficient procedures for the sexing of samples in cattle using either transcriptome data or their DNA. The procedures using RNA-sequencing will greatly benefit researchers who work with samples limited in cell numbers, only sufficient to produce transcriptome data. The oligonucleotides used for the accurate sexing of samples using PCR are transferable to other cattle samples.


Code produced by Fernando Biase. I created this file to permit reproducibility of the findings described in the paper. Please direct questions to Fernando Biase: fbiase at vt.edu
The sequence from the cattle chromosome Y published by Liu et al. 2019 can be obtained using this link CM011803.1
The gtf file with the annotation presented in Liu et al. 2019 is located here Bos_taurus_hybrid.UOA_Angus_1.97.gtf.gz


options(width = 10000)

Bioinformatics

This is a general pipeline used for:

Alignment

#!/bin/bash
path_to_index=/home/fbiase/genome/arsucd_chrY/hisat_index/ars_ucd_1_2_chrY

/home/fbiase/bioinfo/hisat2-2.2.1/hisat2  -p 30 -k 1 -x $path_to_index -1 $read1  -2 $read2  --no-unal -S $output_alignment/alignment_chrY.sam  --summary-file $output_alignment/alignment_summary.txt

Filtering

"converting sam to bam"

samtools sort  -O BAM -l 9 -o $folder/alignment_chrY.bam  $folder/alignment_chrY.sam  &

"indexing"
#####################################################

samtools index   $folder/alignment_chrY.bam   &

"filtering"
#####################################################

samtools view -b -h -F 1796 -o $folder/alignment_chrY.filtered.bam $folder/alignment_chrY.bam  &

"sorting"
#####################################################

samtools sort -O BAM -l 9 -o $folder/alignment_chrY.filtered.sorted.bam  $folder/alignment_chrY.filtered.bam &

"removing duplicates"
##################################################

bammarkduplicates I=$alignment O=$undup_output  level=9 markthreads=1 rmdup=1 index=1 dupindex=0  verbose=0   &

Counting

gtf_file=/home/fbiase/genome/chrY/annotation/Bos_taurus_hybrid.UOA_Angus_1.97_only_chrY.chr.gtf
/subread-2.0.1-source/bin/featureCounts -s 0 -a $gtf_file -o $output -F 'GTF' -t 'exon' -g 'gene_id' --ignoreDup -T 30 -p  $undup_output

Analysis

Libraries

library('edgeR',quietly = TRUE, lib.loc="/usr/lib/R/site-library")
library("DESeq2",quietly = TRUE,lib.loc="/usr/lib/R/site-library")
library('dendextend', quietly = TRUE,lib.loc="/usr/lib/R/site-library")
library('ComplexHeatmap', quietly = TRUE,lib.loc="/usr/lib/R/site-library")
library('dendsort', quietly = TRUE,lib.loc="/usr/lib/R/site-library")
library('circlize', quietly = TRUE,lib.loc="/usr/lib/R/site-library")
library("ggplot2" ,quietly = TRUE, lib.loc="/usr/lib/R/site-library")
library("Rsamtools" ,quietly = TRUE, lib.loc="/usr/lib/R/site-library")
library("kableExtra" ,quietly = TRUE, lib.loc="/usr/lib/R/site-library")
files<-list.files("/mnt/storage/lab_folder/chrY/alignment_chrY_public_data_a", recursive=T, pattern="alignment_chrY.filtered.sorted.undup.bam", full.names = TRUE)
files<-files[grep("female", files, invert = FALSE)]
files<-files[grep("bai", files, invert = TRUE)]

index_stats_females<-data.frame(matrix(nrow=2213))
for (n in 1:length(files)) {
  index_stats<-Rsamtools::idxstatsBam(files[n])
  index_stats<-index_stats[,c(1,3)]
  colnames(index_stats)<-c("seqnames", stringr::str_split_fixed(files[n], "/", 9)[8])
  index_stats_females<-cbind(index_stats_females,index_stats)
}
index_stats_females<-subset(index_stats_females, select= -c(matrix.nrow...2213.))

#round( (index_stats_females[31,seq(from = 2, to = dim(index_stats_females)[2], by = 2)] /colSums(index_stats_females[,seq(from = 2, to = dim(index_stats_females)[2], by = 2)]) )*100 ,2)

dim(index_stats_females[,seq(from = 2, to = dim(index_stats_females)[2], by = 2)])[2]
## [1] 36
ave(colSums(index_stats_females[,seq(from = 2, to = dim(index_stats_females)[2], by = 2)]))[1]
## SRR1178419 
##   17829956
min(round( (index_stats_females[31,seq(from = 2, to = dim(index_stats_females)[2], by = 2)] /colSums(index_stats_females[,seq(from = 2, to = dim(index_stats_females)[2], by = 2)]) )*100 ,2))
## [1] 0.06
max(round( (index_stats_females[31,seq(from = 2, to = dim(index_stats_females)[2], by = 2)] /colSums(index_stats_females[,seq(from = 2, to = dim(index_stats_females)[2], by = 2)]) )*100 ,2))
## [1] 0.16
rowMeans((index_stats_females[31,seq(from = 2, to = dim(index_stats_females)[2], by = 2)] /colSums(index_stats_females[,seq(from = 2, to = dim(index_stats_females)[2], by = 2)]) )*100 )
##         31 
## 0.09959867
files<-list.files("/mnt/storage/lab_folder/chrY/alignment_chrY_public_data_a", recursive=T, pattern="alignment_chrY.filtered.sorted.undup.bam", full.names = TRUE)
files<-files[grep("female", files, invert = TRUE)]
files<-files[grep("bai", files, invert = TRUE)]

index_stats_males<-data.frame(matrix(nrow=2213))
for (n in 1:length(files)) {
  index_stats<-Rsamtools::idxstatsBam(files[n])
  index_stats<-index_stats[,c(1,3)]
  colnames(index_stats)<-c("seqnames", stringr::str_split_fixed(files[n], "/", 9)[8])
  index_stats_males<-cbind(index_stats_males,index_stats)
}

index_stats_males<-subset(index_stats_males, select= -c(matrix.nrow...2213.))

#round( (index_stats_males[31,seq(from = 2, to = dim(index_stats_males)[2], by = 2)] /colSums(index_stats_males[,seq(from = 2, to = dim(index_stats_males)[2], by = 2)]) )*100 ,2)

dim(index_stats_males[,seq(from = 2, to = dim(index_stats_males)[2], by = 2)])[2]
## [1] 30
ave(colSums(index_stats_males[,seq(from = 2, to = dim(index_stats_males)[2], by = 2)]))[1]
## SRR14740610 
##    48700745
min(round( (index_stats_males[31,seq(from = 2, to = dim(index_stats_males)[2], by = 2)] /colSums(index_stats_males[,seq(from = 2, to = dim(index_stats_males)[2], by = 2)]) )*100 ,2))
## [1] 0.05
max(round( (index_stats_males[31,seq(from = 2, to = dim(index_stats_males)[2], by = 2)] /colSums(index_stats_males[,seq(from = 2, to = dim(index_stats_males)[2], by = 2)]) )*100 ,2))
## [1] 0.34
rowMeans((index_stats_males[31,seq(from = 2, to = dim(index_stats_males)[2], by = 2)] /colSums(index_stats_males[,seq(from = 2, to = dim(index_stats_males)[2], by = 2)]) )*100 )
##        31 
## 0.1382073
files<-list.files("/mnt/storage/lab_folder/chrY/alignment_chrY_public_data_a/male/counts_all", recursive=T, pattern="count", full.names = TRUE)
files<-files[grep("summary", files, invert = TRUE)]
files<-files[grep(".sh", files, invert = TRUE)]
#length(files)
count_data_reference_males<-data.frame(matrix(nrow=27607))
for (n in 1:length(files)) {
  count<-read.delim(files[n], header=TRUE, sep= "\t", stringsAsFactors = FALSE, comment.char= "#")
  count<-count[,c(1,7)]
  count_data_reference_males<-cbind(count_data_reference_males,count)
}
rownames(count_data_reference_males)<-count_data_reference_males[,2]
count_data_reference_males<-count_data_reference_males[,seq(from = 3, to = dim(count_data_reference_males)[2], by = 2)]
colnames(count_data_reference_males)<- stringr::str_split_fixed(colnames(count_data_reference_males), "[.]", 9)[,8]


files<-list.files("/mnt/storage/lab_folder/chrY/alignment_chrY_public_data_a/male/counts_chrY_b", recursive=T, pattern="count", full.names = TRUE)
files<-files[grep("summary", files, invert = TRUE)]
files<-files[grep(".sh", files, invert = TRUE)]
#length(files)
count_data_chrY_males<-data.frame(matrix(nrow=192))
for (n in 1:length(files)) {
  count<-read.delim(files[n], header=TRUE, sep= "\t", stringsAsFactors = FALSE, comment.char= "#")
  count<-count[,c(1,7)]
  count_data_chrY_males<-cbind(count_data_chrY_males,count)
}
rownames(count_data_chrY_males)<-count_data_chrY_males[,2]
count_data_chrY_males<-count_data_chrY_males[,seq(from = 3, to = dim(count_data_chrY_males)[2], by = 2)]
colnames(count_data_chrY_males)<- stringr::str_split_fixed(colnames(count_data_chrY_males), "[.]", 9)[,8]

count_data_reference_chrY_males<-rbind(count_data_reference_males, count_data_chrY_males)
## [1] 119  64
## [1] 42 64
group<-data.frame( row.names= c(colnames(count_data_reference_chrY_females), colnames(count_data_reference_chrY_males)),  sample= c(colnames(count_data_reference_chrY_females), colnames(count_data_reference_chrY_males)) , sex=rep(c("female", "male"), c(dim(count_data_reference_chrY_females)[2], dim(count_data_reference_chrY_males)[2])))

group$sex<-as.factor(group$sex)

design<-model.matrix(~  group$sex)

count_data_chrY_edger<-DGEList(count=count_data_reference_chrY_males_females)
count_data_chrY_edger<-estimateDisp(count_data_chrY_edger,design, robust=TRUE)

count_data_chrY_edger_QLFit <- glmQLFit(count_data_chrY_edger, design,robust=TRUE)
count_data_chrY_edger_QLFit <- glmQLFTest(count_data_chrY_edger_QLFit , coef="group$sexmale")
count_data_chrY_edger_QLFit_edgeR_results_QLF<- topTags(count_data_chrY_edger_QLFit, adjust.method = "fdr", n=Inf)$table

count_data_chrY_edger_QLFit_edgeR_results_QLF<-count_data_chrY_edger_QLFit_edgeR_results_QLF[with(count_data_chrY_edger_QLFit_edgeR_results_QLF, order(-F)),]

###

count_data_chrY_DeSeq2<-DESeqDataSetFromMatrix(countData=count_data_reference_chrY_males_females,colData=group, design= ~ sex)

# Wald method
count_data_chrY_DeSeq2_Wald<-DESeq(count_data_chrY_DeSeq2, test="Wald",fitType='local')
count_data_chrY_DeSeq2_Wald<-results(count_data_chrY_DeSeq2_Wald, contrast=c("sex",  "male", "female"), pAdjustMethod="fdr", tidy=TRUE)
count_data_chrY_DeSeq2_DESeq_Wald<-count_data_chrY_DeSeq2_Wald[with(count_data_chrY_DeSeq2_Wald, order(pvalue)),]

count_data_chrY_edgeR_DeSeq2<-merge(count_data_chrY_edger_QLFit_edgeR_results_QLF,count_data_chrY_DeSeq2_DESeq_Wald, by.x='row.names', by.y='row')

count_data_chrY_edgeR_DeSeq2<-count_data_chrY_edgeR_DeSeq2[with(count_data_chrY_edgeR_DeSeq2, order(-F)),]

Table 1

Table 1. Top ten differentially expressed genes between male and female tissues located in the cattle chromosome Y.

count_data_chrY_edger_QLFit_edgeR_results_QLF[1:10,]  %>% kableExtra::kbl( ) %>% kable_classic(full_width = F)
logFC logCPM F PValue FDR
ENSBIXG00000029763 11.269543 14.75076 2118.4299 0 0
ENSBIXG00000029788 9.710046 14.55233 1046.0118 0 0
ENSBIXG00000029774 10.984218 14.47392 813.8676 0 0
ENSBIXG00000029892 8.463869 15.50362 707.0806 0 0
ENSBIXG00000029986 9.591261 13.15171 429.2606 0 0
ENSBIXG00000029770 7.971359 16.16133 341.2628 0 0
ENSBIXG00000029886 -1.951345 18.42128 320.0573 0 0
ENSBIXG00000029889 6.598548 15.13753 228.7008 0 0
ENSBIXG00000029864 6.730399 15.56694 210.2657 0 0
ENSBIXG00000030021 -2.459648 14.85990 172.2769 0 0

Figure 1B

col_fun = colorRamp2(c(min(cpm_data_reference_chrY_males_females), max(cpm_data_reference_chrY_males_females)), c("#FFFFFF", "#1F305E"))

NCBI_GEO<-c("GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435",  "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435" , "GSE55435", "GSE55435","GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435", "GSE55435" , "GSE55435", "GSE55435", "GSE55435", "GSE192530", "GSE192530", "GSE192530", "GSE192530", "GSE192530","GSE176219", "GSE176219", "GSE176219", "GSE176219", "GSE176219", "GSE176219", "GSE176219", "GSE176219", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE196974", "GSE128075", "GSE128075", "GSE128075",  "GSE128075")

samples<-c("hypothalamus_1", "hypothalamus_2", "hypothalamus_3", "hypothalamus_4","pituitary_1", "pituitary_2", "pituitary_3", "pituitary_4", "ovary_1", "ovary_2", "ovary_3", "ovary_4", "uterus_1", "uterus_2", "uterus_3", "uterus_4", "endometrium_1", "endometrium_2", "endometrium_3", "muscle_1", "muscle_2", "muscle_3", "muscle_4", "fat_1" , "fat_2" , "fat_3" , "fat_4" , "liver_1", "liver_2", "liver_3", "liver_4", "PWBC_1", "PWBC_2", "PWBC_3", "PWBC_4", "PWBC_5",  "testis_1", "testis_2", "testis_3", "testis_4", "testis_5", "testis_6", "testis_7", "testis_8", "muscle_1", "muscle_2", "muscle_3", "muscle_4", "muscle_5", "muscle_6", "muscle_7", "muscle_8", "liver_1", "liver_2", "liver_3", "liver_4", "liver_5", "liver_6", "liver_7", "liver_8", "prostate_1", "epididymis_1", "epididymis_2", "testis_2")


colnames(cpm_data_reference_chrY_males_females)<- paste(samples,NCBI_GEO)

colnames(cpm_data_reference_chrY_males_females)<- paste(samples)


#colnames(cpm_data_reference_chrY_males_females)



Heatmap(cpm_data_reference_chrY_males_females, 
                name= "CPM",
               cluster_rows = dendsort(fastcluster::hclust(dist(cpm_data_reference_chrY_males_females), method = "single")), 
               cluster_columns = dendsort(fastcluster::hclust(dist(t(cpm_data_reference_chrY_males_females)), method = "single")), 
               show_row_names = TRUE,
               col = col_fun,
               show_heatmap_legend = TRUE,
               top_annotation = HeatmapAnnotation(
                          "sex"= factor(rep(c("female", "male"), c(dim(count_data_reference_chrY_females)[2], dim(count_data_reference_chrY_males)[2]))), "dataset"= factor(NCBI_GEO) , which = c("column"), col=list("sex"=setNames(c("#cc79a7","#56b3e9"), c("female","male")),"dataset"=setNames(c("#c65b80", "#77a451", "#9765ca", "#c7733b", "#6c93c5"),c("GSE55435", "GSE176219", "GSE196974", "GSE128075", "GSE192530"))) , show_annotation_name = FALSE, show_legend = TRUE))

Figure 1C

colnames(cpm_data_reference_chrY_males_females)<- paste(samples,NCBI_GEO)

cpm_data_reference_chrY_males_females<-as.data.frame(cpm_data_reference_chrY_males_females)
cpm_data_reference_chrY_males_females_graph<-cpm_data_reference_chrY_males_females[rownames(cpm_data_reference_chrY_males_females) %in% c("ENSBIXG00000029788","ENSBIXG00000029892","ENSBIXG00000029774", "ENSBIXG00000029763"),]

cpm_data_reference_chrY_males_females_graph$gene<-rownames(cpm_data_reference_chrY_males_females_graph)
cpm_data_reference_chrY_males_females_graph<-reshape::melt(cpm_data_reference_chrY_males_females_graph,id.vars="gene")

ggplot(cpm_data_reference_chrY_males_females_graph, aes(x=reorder(variable, value, sum), y=value, fill=gene )) + 
    geom_bar(position="stack", stat="identity")+
    #geom_hline(yintercept=500, linetype="dashed", color = "red")+
    scale_y_continuous(name="Counts per million reads")+
    scale_x_discrete(name=NULL)+
    scale_fill_manual(values=c("#015fe3","#c57b00","#8a2f6f","#00a56a","#ffaae3","#ff8060"))+
    #facet_grid(~sex,scales="free")+
    theme_classic()+
    theme(
    axis.title  = element_text(size=15, color="black"),
    axis.text.x = element_text(size=13, color="black", angle=90,vjust=0.5, hjust=1),
    axis.text.y = element_text(size=15, color="black"),
    legend.text = element_text(size=15, color="black"),
    legend.title = element_text(size=15, color="black"),
    legend.position = c(0.2, 0.8)
)

cpm_data_reference_chrY_males_females<-as.data.frame(cpm_data_reference_chrY_males_females)
cpm_data_reference_chrY_males_females_graph<-cpm_data_reference_chrY_males_females[rownames(cpm_data_reference_chrY_males_females) %in% c("ENSBIXG00000029788","ENSBIXG00000029892","ENSBIXG00000029774", "ENSBIXG00000029763"),]
summary(colSums(cpm_data_reference_chrY_males_females_graph[,1:dim(count_data_reference_chrY_females)[2]]))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03041 0.00000 0.36775
summary(colSums(cpm_data_reference_chrY_males_females_graph[,37:dim(cpm_data_reference_chrY_males_females_graph)[2]]))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30.92   37.94   46.76   80.65  111.24  250.85
## [1] 119  22

##Figure 1B

cpm_data_reference_chrY_blastocysts<-as.data.frame(cpm_data_reference_chrY_blastocysts)
cpm_data_reference_chrY_blastocysts_graph<-cpm_data_reference_chrY_blastocysts[rownames(cpm_data_reference_chrY_blastocysts) %in% c("ENSBIXG00000029788","ENSBIXG00000029892","ENSBIXG00000029774", "ENSBIXG00000029763"),]

cpm_data_reference_chrY_blastocysts_graph$gene<-rownames(cpm_data_reference_chrY_blastocysts_graph)
cpm_data_reference_chrY_blastocysts_graph<-reshape::melt(cpm_data_reference_chrY_blastocysts_graph,id.vars="gene")



ggplot(cpm_data_reference_chrY_blastocysts_graph, aes(x=reorder(variable, value, sum), y=value, fill=gene )) + 
    geom_bar(position="stack", stat="identity")+
    #geom_hline(yintercept=500, linetype="dashed", color = "red")+
    scale_y_continuous(name="Counts per million reads")+
    scale_x_discrete(name=NULL)+
    scale_fill_manual(values=c("#015fe3","#c57b00","#8a2f6f","#00a56a","#ffaae3","#ff8060"))+
    #facet_grid(~sex,scales="free")+
    theme_classic()+
    theme(
    axis.title  = element_text(size=15, color="black"),
    axis.text.x = element_text(size=13, color="black", angle=90,vjust=0.5, hjust=1),
    axis.text.y = element_text(size=15, color="black"),
    legend.text = element_text(size=15, color="black"),
    legend.title = element_text(size=15, color="black"),
    legend.position = c(0.2, 0.8)
)

cpm_data_reference_chrY_blastocysts<-as.data.frame(cpm_data_reference_chrY_blastocysts)
cpm_data_reference_chrY_blastocysts_graph<-cpm_data_reference_chrY_blastocysts[rownames(cpm_data_reference_chrY_blastocysts) %in% c("ENSBIXG00000029788","ENSBIXG00000029892","ENSBIXG00000029774", "ENSBIXG00000029763"),]

cpm_data_reference_chrY_blastocysts_graph
##                    blastocyst_1 blastocyst_10 blastocyst_11 blastocyst_12 blastocyst_13 blastocyst_14 blastocyst_15 blastocyst_16 blastocyst_18 blastocyst_19 blastocyst_2 blastocyst_20 blastocyst_21 blastocyst_22 blastocyst_23 blastocyst_3 blastocyst_4 blastocyst_5 blastocyst_6 blastocyst_7 blastocyst_8 blastocyst_9
## ENSBIXG00000029763            0      18.97552      6.061083      11.92225      16.87044      9.758044      11.71454             0             0             0            0      13.06061    0.00000000      7.238592      15.68339            0            0     9.008661    1.5926710     16.49501     10.23573     8.045926
## ENSBIXG00000029774            0      13.81169     21.483837      22.84052      29.46628     11.051928      10.75545             0             0             0            0      27.95786    0.00000000     16.930944      22.96496            0            0    18.017322    0.2654452     13.44391     19.97079    19.674804
## ENSBIXG00000029788            0      48.65200     38.826934      40.59839     112.22264     60.057797      16.64698             0             0             0            0      60.26926    0.00000000     36.745056      89.00830            0            0    48.617393   10.6178064     40.28406     55.57332    80.459261
## ENSBIXG00000029892            0      96.30856     38.106806      76.74163     116.72522    106.098513      73.84956             0             0             0            0      71.35717    0.05925871     84.593375      90.53591            0            0   110.062337    9.5560257     65.74168     84.22223    91.522410
colSums(cpm_data_reference_chrY_blastocysts_graph)
##  blastocyst_1 blastocyst_10 blastocyst_11 blastocyst_12 blastocyst_13 blastocyst_14 blastocyst_15 blastocyst_16 blastocyst_18 blastocyst_19  blastocyst_2 blastocyst_20 blastocyst_21 blastocyst_22 blastocyst_23  blastocyst_3  blastocyst_4  blastocyst_5  blastocyst_6  blastocyst_7  blastocyst_8  blastocyst_9 
##    0.00000000  177.74778193  104.47865989  152.10279287  275.28457472  186.96628283  112.96652985    0.00000000    0.00000000    0.00000000    0.00000000  172.64489876    0.05925871  145.50796604  218.19255399    0.00000000    0.00000000  185.70571280   22.03194818  135.96466597  170.00207663  199.70240131