Skip to main content

Tile generation of big data

Project description

AWS_ES_DOMAIN=52.23.165.123:9872

# Create tiles from coolers
#
# For each cooler named in the loop, dump the balanced, joined matrix as text
# and write a gzipped, numerically sorted copy next to the input as
# <cooler>.genome.sorted.gz.
workon py3
assembly=hg19
FILENAME=/Dixon2015-H1_hESC-HindIII-allreps-filtered.5kb.cool
#for FILENAME in Dixon2015-H1_NP-HindIII-allreps-filtered.50kb.cool Dixon2015-H1_NP-HindIII-allreps-filtered.5kb.cool;
for FILENAME in \
  Rao2014-NHEK-MboI-allreps-filtered.50kb.cool \
  Rao2014-K562-MboI-allreps-filtered.50kb.cool \
  Rao2014-IMR90-MboI-allreps-filtered.50kb.cool \
  Rao2014-HUVEC-MboI-allreps-filtered.50kb.cool \
  Rao2014-HMEC-MboI-allreps-filtered.50kb.cool \
  Rao2014-GM12878-MboI-allreps-filtered.50kb.cool
do
  DATASET_NAME="${assembly}/${FILENAME}"
  FILEPATH="/data/coolers/${DATASET_NAME}"
  # awk keeps fields 1, 2, 4 and 5 and appends field 7 as the value, writing 0
  # instead when a row has exactly six fields. grep then removes every line
  # containing "start1" (presumably the header row).
  python ~/projects/cooler/scripts/dump_matrix_txt.py "${FILEPATH}" --balanced --join --out - \
    | awk '{ if (NF == 6) print $1 "\t" $2 "\t" $4 "\t" $5 "\t" 0; else print $1 "\t" $2 "\t" $4 "\t" $5 "\t" $7; }' \
    | grep -v start1 \
    | chr_pos_to_genome_pos.py -c 1,2:3,4 -a "$assembly" \
    | make_triangular.py \
    | sort -k1,1n -k2,2n - \
    | gzip > "${FILEPATH}.genome.sorted.gz"
done;


# Stream each sorted, gzipped 50kb matrix into the tiler, which writes to the
# Elasticsearch path <ASSEMBLY>/<dataset file name>.
AWS_ES_DOMAIN=52.45.229.11:9872
ASSEMBLY=hg19
RESOLUTION=50000
for DATASET_NAME in \
  Rao2014-NHEK-MboI-allreps-filtered.50kb.cool.genome.sorted.gz \
  Rao2014-K562-MboI-allreps-filtered.50kb.cool.genome.sorted.gz \
  Rao2014-IMR90-MboI-allreps-filtered.50kb.cool.genome.sorted.gz \
  Rao2014-HUVEC-MboI-allreps-filtered.50kb.cool.genome.sorted.gz \
  Rao2014-HMEC-MboI-allreps-filtered.50kb.cool.genome.sorted.gz \
  Rao2014-GM12878-MboI-allreps-filtered.50kb.cool.genome.sorted.gz
do
  INDEX_NAME="${ASSEMBLY}/${DATASET_NAME}"
  #INDEX_NAME=${DATASET_NAME,,}/tiles
  echo "$INDEX_NAME"
  FILENAME="coolers/${ASSEMBLY}/${DATASET_NAME}"
  FILEPATH=~/data/${FILENAME}
  #curl -XDELETE "http://${AWS_ES_DOMAIN}/${DATASET_NAME,,}"
  # 16:48:26
  zcat "${FILEPATH}" \
    | /usr/bin/time python scripts/make_single_threaded_tiles.py \
        --assembly "${ASSEMBLY}" \
        -b 256 \
        -r "${RESOLUTION}" \
        --elasticsearch-url "${AWS_ES_DOMAIN}/${INDEX_NAME}" \
        --num-threads 4 \
        --triangular \
        --log-file clodius.log
done;


###################################################################################################################################
# Creating autocomplete tiles

#AWS_ES_DOMAIN=52.23.165.123:9872
# wget https://raw.githubusercontent.com/pkerpedjiev/gene-citation-counts/master/all_gene_counts.tsv
# mv all_gene_counts.tsv ~/data/genbank-data/human/
AWS_ES_DOMAIN=52.45.229.11:9872
workon py3
ASSEMBLY=hg19
# Build the autocomplete list from the refgene-count directory; entries are
# named by the geneName column and ranked by the count column.
python scripts/make_autocomplete_list.py \
  -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count \
  ~/data/hg19/genbank-output/refgene-count/ \
  -n geneName \
  -i count \
  --elasticsearch-url "${AWS_ES_DOMAIN}/${ASSEMBLY}.autocomplete" \
  --reverse-importance

################################################################################################################################
# Gene Density Data
# Step 1: rewrite the bedGraph as "chrom start chrom end value" rows, convert
# them with chr_pos_to_genome_pos.py, sort numerically and gzip the result.
FILEPATH=~/data/encode/hg19/gencodeDensity
awk '{ print $1, $2, $1, $3, $4}' "${FILEPATH}.bedGraph" \
  | chr_pos_to_genome_pos.py -c 1,2:3,4 -a hg19 \
  | /usr/bin/time sort -k1,1n -k2,2n - \
  | gzip > "${FILEPATH}.genome.sorted.gz"

# Step 2: tile the sorted file into Elasticsearch.
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
RESOLUTION=1
DATASET_NAME=gencodeDensity.genome.sorted.gz
INDEX_NAME="${ASSEMBLY}/${DATASET_NAME}"
#INDEX_NAME=${DATASET_NAME,,}/tiles
#echo $INDEX_NAME
FILENAME="encode/${ASSEMBLY}/${DATASET_NAME}"
FILEPATH=~/data/${FILENAME}
##curl -XDELETE "http://${AWS_ES_DOMAIN}/${DATASET_NAME,,}"
# 16:48:26
zcat "${FILEPATH}" \
  | /usr/bin/time python scripts/make_single_threaded_tiles.py \
      --assembly "${ASSEMBLY}" \
      -b 256 \
      -r "${RESOLUTION}" \
      --expand-range 1,2 \
      --ignore-0 \
      -k 1 \
      -v 3 \
      --elasticsearch-url "${AWS_ES_DOMAIN}/${INDEX_NAME}" \
      --log-file clodius.log \
      --max-queue-size 2000 \
      --print-status

################################################################################################################################
### Max's Data

IDENTIFIER=UMB5144
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
BASE_PATH=~/data/clodius-input/hg19
FILEPATH=${BASE_PATH}/${IDENTIFIER}.all_bins.tsv
INDEX_NAME=${ASSEMBLY}/${ASSEMBLY}.${IDENTIFIER}.all_bins.sorted.genome.gz
RESOLUTION=1000


################### Prepare
#
# Concatenate the per-chromosome 1000bp bin files into ${FILEPATH}, then write
# a numerically sorted, gzipped copy to ${FILEPATH}.genome.sorted.gz.

# Start from an empty output file, because the loop below appends to it.
# -f keeps a first run (no previous output) from reporting an error.
rm -f -- "${FILEPATH}"
# Per-chromosome files are named with zero-padded numbers (01, 02, ...) plus
# X and Y. The range runs to 22 so the last human autosome is included; it
# previously stopped at 21, which left chr22 out of the combined file.
for i in $(seq -f "%02g" 22) X Y;
do
  # Drop lines containing "start" (presumably the header), strip the zero
  # padding to form the "chrN" name, and emit
  # chrom/pos/chrom/pos/value... rows for chr_pos_to_genome_pos.py.
  pv -cN "chr${i}" "${BASE_PATH}/${IDENTIFIER}/b1000/${IDENTIFIER}-final-${i}.b1000.bin" \
    | grep -v start \
    | awk -v chrom="${i}" '{ gsub("^0*", "", chrom); print "chr" chrom "\t" $1 "\t" "chr" chrom "\t" $2 "\t" $3 "\t" $4 "\t" $5 }' \
    | chr_pos_to_genome_pos.py -c 1,2:3,4 -a hg19 >> "${FILEPATH}"
done;
pv -cN sorting... "${FILEPATH}" | sort -k1,1n -k2,2n - | gzip > "${FILEPATH}.genome.sorted.gz"

################### Tile Ratios

AWS_ES_DOMAIN=52.23.165.123:9872

# Trial run: only the first 1000 rows, value column 3, output written to the
# column file path /tmp/maxs-tiles instead of Elasticsearch.
# 16:48:26
zcat "${FILEPATH}.genome.sorted.gz" \
  | head -n 1000 \
  | /usr/bin/time python scripts/make_single_threaded_tiles.py \
      --assembly "${ASSEMBLY}" \
      -b 256 \
      -r "${RESOLUTION}" \
      --expand-range 1,2 \
      -k 1 \
      -v 3 \
      --columnfile-path /tmp/maxs-tiles \
      --log-file clodius.log \
      --max-queue-size 2000 \
      --print-status

# Full run: every row, value columns 3 and 4, written to Elasticsearch.
pv -cN tiling "${FILEPATH}.genome.sorted.gz" \
  | zcat \
  | /usr/bin/time python scripts/make_single_threaded_tiles.py \
      --assembly "${ASSEMBLY}" \
      -b 256 \
      -r "${RESOLUTION}" \
      --expand-range 1,2 \
      -k 1 \
      -v 3,4 \
      --elasticsearch-url "${AWS_ES_DOMAIN}/${INDEX_NAME}" \
      --log-file clodius.log \
      --max-queue-size 2000

############# Tile CNV calls

DATASET_NAME=UMB5144__B1000_l03.BICseq.out
CNV_PATH="${BASE_PATH}/${IDENTIFIER}/b1000/lambda_03/${DATASET_NAME}"
# Drop lines containing "start", repeat the chromosome column so each row reads
# chrom/start/chrom/end/..., and write the converted rows to <CNV_PATH>.genome.
pv -cN chr_pos_to_genome_pos "$CNV_PATH" \
  | grep -v 'start' \
  | awk '{print $1, $2, $1, $3, $4, $5, $6, $7, $8}' \
  | chr_pos_to_genome_pos.py -c 1,2:3,4 -a hg19 > "${CNV_PATH}.genome"

INPUT_FILE="${CNV_PATH}.genome"
AWS_ES_DOMAIN=52.45.229.11:9872
ASSEMBLY=hg19
INDEX_NAME="${ASSEMBLY}/${DATASET_NAME}"
# Tile the CNV calls as interval entries (start..end), ranked by pValue.
/usr/bin/time python scripts/make_tiles.py \
  --elasticsearch-nodes "${AWS_ES_DOMAIN}" \
  --elasticsearch-path "$INDEX_NAME" \
  -v count \
  --position start \
  --end-position end \
  -c chrom,start,end,binNum,observed,expected,log2_copyRatio,pValue \
  --max-zoom 18 \
  -i pValue \
  --importance \
  --max-entries-per-tile 16 \
  --assembly "${ASSEMBLY}" \
  "${INPUT_FILE}"



################################################################################################################################
### Cooler to tiles

# Tile a multi-resolution mm9 cooler up to zoom level 3 into Elasticsearch.
python scripts/cooler_to_tiles.py \
  /data/tmp/UNTR.1kb.multires.cool \
  --assembly mm9 \
  --max-zoom 3 \
  --elasticsearch-url 52.23.165.123:9872/hg19.1/mm9.UNTR.1kb.multires.cool



################################################################################################################################
### Tile BigWig

# This step switches to the py2 virtualenv before running the bigWig tiler.
workon py2
python scripts/tile_bigWig.py \
  ~/data/clodius-input/hg19/E014-H3K27me3.fc.signal.bigwig \
  --assembly hg19

################################################################################################################################
###### Gene Annotations

AWS_ES_DOMAIN=52.23.165.123:9872
INPUT_FILE=~/data/hg19/genbank-output/refgene-count-minus
DATASET_NAME=hg19/refgene-tiles-minus
# OUTPUT_DIR is set here but not read by the command below.
OUTPUT_DIR=~/data/${DATASET_NAME}
# Tile the refgene-count-minus annotations as intervals
# (genomeTxStart..genomeTxEnd), ranked by the count column.
/usr/bin/time python scripts/make_tiles.py \
  --elasticsearch-nodes "${AWS_ES_DOMAIN}" \
  --elasticsearch-path "$DATASET_NAME" \
  -v count \
  --position genomeTxStart \
  --end-position genomeTxEnd \
  -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid \
  --max-zoom 18 \
  -i count \
  --importance \
  --reverse-importance \
  --max-entries-per-tile 16 \
  "$INPUT_FILE"





















###
#
# Tile the unbalanced Dixon2015 H1 hESC matrices, first at 50kb and then at
# 5kb. The file name carries the resolution as "<N>kb", derived here from
# RESOLUTION.
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
for RESOLUTION in 50000 5000; do
  DATASET_NAME=Dixon2015-H1_hESC-HindIII-allreps-filtered.$((RESOLUTION / 1000))kb.cool.unbalanced.genome.sorted.gz
  INDEX_NAME="${ASSEMBLY}/${DATASET_NAME}"
  #INDEX_NAME=${DATASET_NAME,,}/tiles
  echo "$INDEX_NAME"
  FILENAME="coolers/${ASSEMBLY}/${DATASET_NAME}"
  FILEPATH=~/data/${FILENAME}
  #curl -XDELETE "http://${AWS_ES_DOMAIN}/${DATASET_NAME,,}"
  # 16:48:26
  zcat "${FILEPATH}" \
    | /usr/bin/time python scripts/make_single_threaded_tiles.py \
        --assembly "${ASSEMBLY}" \
        -b 256 \
        -r "${RESOLUTION}" \
        --elasticsearch-url "${AWS_ES_DOMAIN}/${INDEX_NAME}" \
        --num-threads 4 \
        --triangular \
        --log-file clodius.log \
        --max-queue-size 2000
done

### Gene density


ASSEMBLY=hg19
DATASET_NAME=gencodeDensity.bedGraph.txt
INDEX_NAME="hg19.1/${DATASET_NAME}"
FILENAME="encode/${ASSEMBLY}/${DATASET_NAME}"
FILEPATH=~/data/${FILENAME}
# NOTE(review): the tiling step reads ${FILEPATH}.genome.sorted.gz, which is
# presumably produced by process_file.py. Confirm.
python scripts/process_file.py --assembly hg19 --type bedgraph "$FILEPATH"

#zcat ~/data/clodius-input/hg19/geneDensity3.bedGraph.txt.genome.sorted.gz | head -n 50 | /usr/bin/time pypy scripts/make_single_threaded_tiles.py --min-pos 1 --max-pos 10000000 -b 256 -r 1 --expand-range 1,2 --ignore-0 -k 1 -v 3 --elasticsearch-url 52.23.165.123:9872/hg19.1/geneDensity3.bedgraph.txt.genome.sorted.gz.1
zcat "${FILEPATH}.genome.sorted.gz" \
  | /usr/bin/time python scripts/make_single_threaded_tiles.py \
      --assembly hg19 \
      -b 256 \
      -r 1 \
      --expand-range 1,2 \
      --ignore-0 \
      -k 1 \
      -v 3 \
      --elasticsearch-url "52.23.165.123:9872/${INDEX_NAME}" \
      --print-status

### Gene Information

# Re-create the hg19 minus-strand gene annotation tiles: delete every document
# under the index path, then tile the refgene-count-minus file again.
AWS_ES_DOMAIN=54.197.186.181:9872
INPUT_FILE=~/data/hg19/genbank-output/refgene-count-minus
INDEX_NAME=hg19/refgene-tiles-minus
# Derived from INDEX_NAME, the name this stanza defines. It previously used
# DATASET_NAME, which this stanza never sets, so it pointed at whatever an
# earlier stanza had left behind. The commands below do not read OUTPUT_DIR.
OUTPUT_DIR=~/data/${INDEX_NAME}

# Delete-by-query with match_all removes all existing documents first.
curl -XDELETE "http://${AWS_ES_DOMAIN}/${INDEX_NAME}/_query" -d '{
"query" : {
"match_all" : {}
}
}'
/usr/bin/time python scripts/make_tiles.py \
  --elasticsearch-nodes "${AWS_ES_DOMAIN}" \
  --elasticsearch-path "$INDEX_NAME" \
  -v count \
  --position genomeTxStart \
  --end-position genomeTxEnd \
  -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid \
  --max-zoom 18 \
  -i count \
  --importance \
  --reverse-importance \
  --max-entries-per-tile 16 \
  --assembly hg19 \
  "${INPUT_FILE}"
#/usr/bin/time python scripts/make_tiles.py --elasticsearch-nodes ${AWS_ES_DOMAIN} --elasticsearch-path $INDEX_NAME -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 18 -i count --importance --reverse-importance --max-entries-per-tile 16 --assembly hg19 /tmp/perm2

### Nometonome

AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=GCF_000005845.2_ASM584v2_genomic

# Stream the contacts file into the tiler. The pipe between cat and
# /usr/bin/time was missing, so cat was handed the whole tiler command line as
# extra file names and the tiler never ran.
cat ~/projects/nometonome/contacts/GCF_000005845.2_ASM584v2_genomic.22.contacts.genome \
  | /usr/bin/time pypy scripts/make_single_threaded_tiles.py \
      --min-pos 1,1 \
      --max-pos 4641652,4641652 \
      -b 256 \
      -r 1 \
      -v 3 \
      --elasticsearch-url 52.23.165.123:9872/hg19.1/GCF_000005845.2_ASM584v2_genomic.22.contacts.genome

[
{
"chromInfoPath": "//s3.amazonaws.com/pkerp/data/hg19/chromInfo.txt",
"domain": [
0,
4641652
],
"viewStyle": {
"float": "left",
"padding": "5px",
"width": "100%"
},
"tracks": [
{
"source": "//52.23.165.123:9872/hg19.1/GCF_000005845.2_ASM584v2_genomic.22.contacts.genome",
"type": "heatmap",
"height": 300
}
],
"zoomLock": 0
}
]






























AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
DATASET_NAME=refGeneBed.bedGraph.txt.genome.sorted.gz
INDEX_NAME="hg19.x/${DATASET_NAME}"
FILEPATH="/data/encode/${ASSEMBLY}/${DATASET_NAME}"

# Tile the sorted refGeneBed bedGraph with pypy into the hg19.x index path.
zcat "${FILEPATH}" \
  | /usr/bin/time pypy scripts/make_single_threaded_tiles.py \
      --assembly hg19 \
      -b 256 \
      -r 1 \
      --expand-range 1,2 \
      --ignore-0 \
      -k 1 \
      -v 3 \
      --elasticsearch-url "${AWS_ES_DOMAIN}/${INDEX_NAME}"

##########################################################################################
### HiC data
#########################################################################################


### Smaller test set

### Real data set

# Alternative Rao et al. inputs kept for reference. Each assignment is
# overridden by the FILENAME derived from DATASET_NAME below.
FILENAME=rao_et_al/HMEC/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved
FILENAME=rao_et_al/HUVEC/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved

FILENAME=rao_et_al/IMR90/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved
FILENAME=rao_et_al/GM12878_primary/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved

DATASET_NAME=hg19/Dixon2015-H1hESC_ES-HindIII-allreps-filtered.1kb.genome.gz
FILENAME=coolers/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}

# Build the mirrored matrix: the original rows followed by the same rows with
# the first two columns swapped.
zcat "$FILEPATH" > "${FILEPATH}.mirrored"
zcat "$FILEPATH" | awk '{ print $2 "\t" $1 "\t" $3; }' >> "${FILEPATH}.mirrored"

# Create the shuffled copy that every later step reads. This step was missing,
# so ${FILEPATH}.mirrored.shuffled was consumed without ever being produced.
# NOTE(review): shuf is GNU coreutils and reads the whole input before
# writing; on macOS use gshuf.
shuf "${FILEPATH}.mirrored" > "${FILEPATH}.mirrored.shuffled"

# Shortened input (first 40M shuffled rows) for a trial run.
head -n 40000000 "${FILEPATH}.mirrored.shuffled" > "${FILEPATH}.short"

#SPARK_HOME_DIR=/Users/peter/Downloads/spark-1.6.1
#SPARK_HOME_DIR=/home/ubuntu/apps/spark-1.6.1-bin-hadoop2.6
SPARK_HOME_DIR=~/spark-home

# Trial run on the shortened file into the local test_shorter/tiles path.
/usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense --use-spark "${FILEPATH}.short" --elasticsearch-nodes localhost:9200 --elasticsearch-path test_shorter/tiles

# Run locally
# OUTPUT_DIR=${FILEPATH}.short.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time python scripts/make_tiles.py -o $OUTPUT_DIR -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense ${FILEPATH}.short

# Full run on the complete shuffled file.
/usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense --use-spark --elasticsearch-nodes localhost:9200 --elasticsearch-path "${DATASET_NAME}" "${FILEPATH}.mirrored.shuffled"

#OUTPUT_DIR=${FILEPATH}.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -o $OUTPUT_DIR -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense --use-spark ${FILEPATH}.mirrored

#find $OUTPUT_DIR -name "*.json" | xargs chmod a+r

aws s3 sync --region us-west-2 ~/data/${FILENAME}.tiles s3://pkerp/data/${FILENAME}.tiles


##########################################################################################
### Gene annotations
#########################################################################################
ASSEMBLY=mm9

AWS_ES_DOMAIN=52.23.165.123:9872
# Tile the minus-strand and then the plus-strand refgene annotations; the two
# runs differ only in the strand suffix of the input file and index name.
for STRAND in minus plus; do
  DATASET_NAME=refgene-tiles-${STRAND}
  INPUT_FILE=~/data/${ASSEMBLY}/genbank-output/refgene-count-${STRAND}
  INDEX_NAME="hg19.1/${ASSEMBLY}.${DATASET_NAME}"
  /usr/bin/time python scripts/make_tiles.py \
    --elasticsearch-nodes "${AWS_ES_DOMAIN}" \
    --elasticsearch-path "$INDEX_NAME" \
    -v count \
    --position genomeTxStart \
    --end-position genomeTxEnd \
    -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid \
    --max-zoom 18 \
    -i count \
    --importance \
    --reverse-importance \
    --max-entries-per-tile 16 \
    "$INPUT_FILE"
done

#rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time python scripts/make_tiles.py -o $OUTPUT_DIR -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 5 -i count --importance --reverse-importance --max-entries-per-tile 16 $INPUT_FILE
#rsync -avzP $OUTPUT_DIR/ ~/projects/goomba/.tmp/jsons/${DATASET_NAME}


#rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time python scripts/make_tiles.py -o $OUTPUT_DIR -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 5 -i count --importance --reverse-importance --max-entries-per-tile 16 $INPUT_FILE
#rsync -avzP $OUTPUT_DIR/ ~/projects/goomba/.tmp/jsons/${DATASET_NAME}

#aws s3 sync $OUTPUT_DIR s3://pkerp/$OUTPUT_PART
#aws s3 sync $OUTPUT_DIR s3://pkerp/$OUTPUT_PART

##########################################################################################
## Wiggle Tracks from
## BEDGraph files
##########################################################################################

DATASET_NAME=hg19/E116-DNase.fc.signal.bigwig
FILENAME="ENCODE/${DATASET_NAME}"
FILEPATH=~/data/${FILENAME}

#bigWigToBedGraph ${FILEPATH} ${FILEPATH}.bedGraph # 60 seconds
#cat ${FILEPATH}.bedGraph | awk '{print $1,$2,$1,$3,$4}' | chr_pos_to_genome_pos.py -e 4 > ${FILEPATH}.bedGraph.genome
# Work on the first million converted rows only.
head -n 1000000 "${FILEPATH}.bedGraph.genome" > "${FILEPATH}.short"

SPARK_HOME_DIR=~/spark-home

# Empty the output directory by syncing it against blank/, make sure it
# exists, then tile the shortened file into it with Spark.
OUTPUT_DIR="${FILEPATH}.short.tiles"
rsync -a --delete blank/ "$OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR"
/usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py \
  -v value \
  -c pos1,pos2,value \
  --position pos1 \
  --range pos1,pos2 \
  --range-except-0 value \
  -i value \
  --resolution 1 \
  --bins-per-dimension 64 \
  --max-zoom 20 \
  --use-spark \
  "${FILEPATH}.short" \
  -o "$OUTPUT_DIR"

aws s3 sync "$OUTPUT_DIR" "s3://pkerp/data/served/$DATASET_NAME"

#OUTPUT_DIR=${FILEPATH}.short.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v value -c chrom,pos1,pos2,value --position pos1 --range pos1,pos2 -i value --resolution 1 --bins-per-dimension 64 --max-zoom 20 --use-spark ${FILEPATH}.short --elasticsearch-nodes localhost:9200 --elasticsearch-path test_short/bed

# Same procedure on the 100000-row sample under test/, limited to zoom level 5.
SPARK_HOME_DIR=~/spark-home
DATASET_NAME=sample_data/E116-DNase.fc.signal.bigwig.bedGraph.genome.100000
FILEPATH="test/${DATASET_NAME}"
OUTPUT_DIR="${FILEPATH}.tiles"
rsync -a --delete blank/ "$OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR"
/usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py \
  -v value \
  -c pos1,pos2,value \
  --position pos1 \
  --range pos1,pos2 \
  --range-except-0 value \
  -i value \
  --resolution 1 \
  --bins-per-dimension 64 \
  --max-zoom 5 \
  --use-spark \
  "${FILEPATH}" \
  -o "$OUTPUT_DIR"

rsync -avzP "$OUTPUT_DIR/" ~/projects/goomba/.tmp/jsons/${DATASET_NAME}

#aws s3 sync $OUTPUT_DIR s3://pkerp/data/served/${DATASET_NAME}.tiles

## Small file

# Small file: clear ./output against blank/, recreate it, and write dense
# tiles from the space-delimited sample bedGraph.
OUTPUT_DIR=output
rsync -a --delete blank/ "$OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR"
python scripts/make_tiles.py \
  -o "$OUTPUT_DIR" \
  -v value \
  -c chrom,pos1,pos2,value \
  --range pos1,pos2 \
  -i value \
  test/data/smallBedGraph.tsv \
  --delimiter ' ' \
  --position pos1 \
  --resolution 1 \
  --max-zoom 14 \
  --output-format dense \
  --bins-per-dimension 128

### Real file

OUTPUT_DIR=~/data/ENCODE/2016-05-16-GM12878-RNASeq/tiles

# Same tiling options, run through spark-submit with 8G of driver memory,
# followed by an upload of the tiles to S3.
/usr/bin/time spark-submit --driver-memory 8G scripts/make_tiles.py \
  -o "$OUTPUT_DIR" \
  -v value \
  -c chrom,pos1,pos2,value \
  --range pos1,pos2 \
  -i value \
  --position pos1 \
  --resolution 1 \
  --max-zoom 14 \
  --output-format dense \
  --bins-per-dimension 128 \
  ~/data/ENCODE/2016-05-16-GM12878-RNASeq/ENCFF000FAA_chr1.bedGraph \
  --use-spark
aws s3 sync --region us-west-2 "$OUTPUT_DIR" s3://pkerp/data/ENCODE/2016-05-16-GM12878-RNASeq/tiles

## BAM files

# Write a small BAM holding the first 65536 lines (header lines included) of
# the SAM text of the source BAM. The trailing "-" makes the second samtools
# read stdin explicitly; without an input argument, older samtools releases
# print usage instead of reading the pipe.
samtools view -h data/bam/GM12878_SRR1658581_10pc_3_R1_hg19.bwt2glob.bam \
  | head -n 65536 \
  | samtools view -Sb - > data/bam/65536.bam

####

To silence Spark's logging, turn logging off in log4j.properties, place that log4j.properties file in ~/.spark-conf, and point Spark to that directory:

# Export the configuration directory (~/.spark-conf) for child processes such as spark-submit.
export SPARK_CONF_DIR=~/.spark-conf

#### Create ElasticSearch mapping

# Local index: optimize, then show the mapping and the statistics.
LOCAL_INDEX_URL="http://127.0.0.1:9200/test_short"
for ENDPOINT in _optimize _mapping _stats; do
  curl -XGET "${LOCAL_INDEX_URL}/${ENDPOINT}"
done

# Hosted index: delete the tileset, fetch tile 14.21.12077, then list every
# document with a match_all search.
ES_HOST=search-es4dn-z7rzz4kevtoyh5pfjkmjg5jsga.us-east-1.es.amazonaws.com
TILESET=hg19/Dixon2015-H1hESC_ES-HindIII-allreps-filtered.1kb.genome.gz.mirrored.shuffled
curl -XDELETE "http://${ES_HOST}/${TILESET}/"
curl -XGET "http://${ES_HOST}/${TILESET}/14.21.12077"

curl -XGET "${ES_HOST}/${TILESET}/_search" -d '
{
"query" : {
"match_all" : {}
}
}'

# Index settings shared by both targets below: a dynamic template matching
# every field ("*") that sets "index": "no" on it.
ES_MAPPING='
{
"mappings": {
"_default_": {
"dynamic_templates": [
{ "notanalyzed": {
"match": "*",
"mapping": {
"index": "no"
}
}
}
]
}
}
}'

# Local Elasticsearch: drop and re-create the hg19 index. The request body for
# this PUT used to be an unterminated quote that swallowed the commands after
# it; the mapping is now passed explicitly.
curl -XDELETE "http://127.0.0.1:9200/hg19"
curl -XPUT "localhost:9200/hg19" -d "${ES_MAPPING}"

# Hosted Elasticsearch: same procedure against the AWS domain.
curl -XDELETE "http://search-higlass-ssxwuix6kow3sekyeresi7ay5e.us-east-1.es.amazonaws.com/hg19"
curl -XPUT "http://search-higlass-ssxwuix6kow3sekyeresi7ay5e.us-east-1.es.amazonaws.com/hg19" -d "${ES_MAPPING}"


#########################################################################################
#### Preparing test data
###########################################################################################

# Cut sample files of 20212 and 100000 rows from the converted DNase bedGraph;
# each sample file name ends with its row count.
for N_ROWS in 20212 100000; do
  head -n "${N_ROWS}" ~/data/ENCODE/hg19/E116-DNase.fc.signal.bigwig.bedGraph.genome \
    > "test/sample_data/E116-DNase.fc.signal.bigwig.bedGraph.genome.${N_ROWS}"
done

Project details


Release history Release notifications | RSS feed

Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

clodius-0.1.1.tar.gz (20.6 kB view hashes)

Uploaded Source

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page