Extracting conserved regions from PhastCons

Commands for extracting conserved regions from 100-way PhastCons conservation tracks. Then BLAST can be used to extract orthologous regions from genome of interest for as many regions as possible.

# download the human gene annotations
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz

# convert human gene annotations to GTF file format
zcat refGene.txt.gz | cut -f 2- | genePredToGtf -utr file stdin stdout > refGene.gtf

# extract exon features from GTF file
cat refGene.gtf | awk '{ if ($3 == "exon") print $0 }' > refGene.exons.gtf

# download the human genome
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz

# download chromosome sizes file
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes

# download MAF alignment files
mkdir maf
for i in `seq 8 22` M X Y; \
do \
rsync -avz --progress rsync://hgdownload.cse.ucsc.edu/goldenPath/hg38/multiz100way/maf/chr${i}.maf.gz ./maf/; \
done

# download phastCons conservation tracks
mkdir phastCons100way
for i in `seq 8 22` M X Y; \
do \
rsync -avz --progress rsync://hgdownload.cse.ucsc.edu/goldenPath/hg38/phastCons100way/hg38.100way.phastCons/chr${i}.phastCons100way.wigFix.gz ./phastCons100way/; \
done

# convert phastCons tracks into BedGraph
cd phas5Cons100way
for file in *.gz; \
do \
base=`basename ${file} .wigFix.gz`; \
echo ${base}; \
gunzip -c ${file} > ${base}.wigFix; \
wigToBigWig ${base}.wigFix ../hg38.chrom.sizes ${base}.bw; \
bigWigToBedGraph ${base}.bw ${base}.bedGraph; \
done

# download the annotation table for 100 vertebrate conserved elements
# 1. Go to http://genome.ucsc.edu/cgi-bin/hgTables
#
# 2. Settings:
# 	clade=Mammal
# 	genome=Human
# 	assembly=hg38
# 	group=Comparative Genomics
# 	track=Conservation
# 	table=100 Vert. El (phastConsElements100way
# 	output format=all fields from selected table
# 	output file=phastConsElements100way.txt
# 	**others**=defaults
# This downloads a BED-like table of conserved elements based on the 100-way vertebrate alignment.
# Here are some stats:
# 	count=10,350,729
# 	bases=162,179,256 (5.32%)
# 	smallest item=1
# 	average item=16
# 	biggest item=3,732
# 	smallest score=186
# 	average score=333
# 	biggest score=1,000

# extract conserved regions that fall outside coding exons (25% or less of the conserved region falls inside an exon)
cat phastConsElements100way.txt | \
tail -n +2 | cut -f 2- | \
awk '{ if ($3 - $2 >= 100) print $0 }' | \
bedtools intersect -wao -a stdin -b refGene.exons.gtf | \
awk -v OFS="\t" -F "\t" '{ print $0, $15 / ($3 - $2) }' | \
awk -F "\t" '{ if ($16 <= 0.25) print $0 }' | cut -f 1-5 \
> phastConsElements100way.noncoding.bed

# Here are some stats:
# 	count=138,744
# 	bases=25,180,979

# extract sequences for conserved regions from human genome
bedtools getfasta -fi hg38.fa -bed phastConsElements100way.noncoding.bed \
> phastConsElements100way.noncoding.fasta