Skip to content

Commit 7f2f207

Browse files
committed
2 parents bac72c7 + 4bfa360 commit 7f2f207

File tree

2 files changed

+45
-23
lines changed

2 files changed

+45
-23
lines changed

RepeatMaskGenome.snakefile

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ configfile: "sd_analysis.json"
1010
if "asm" not in config and "assembly" in config:
1111
config["asm"] = config["assembly"]
1212

13-
asmFai=config["asm"] + ".fai"
13+
assembly="assembly.orig.fasta"
14+
asmFai="assembly.orig.fasta.fai"
15+
#asmFai=config["asm"] + ".fai"
1416
asmFaiFile=open(asmFai)
1517
contigs=[l.split()[0] for l in asmFaiFile]
1618
strIdx=[str(i) for i in range(0,len(contigs))]
@@ -27,7 +29,7 @@ rule all:
2729

2830
rule SplitContig:
2931
input:
30-
asm=config["asm"]
32+
asm=assembly
3133
output:
3234
contig="split/to_mask.{index}.fasta",
3335
params:
@@ -78,26 +80,31 @@ rule SpecialMaskContig:
7880
tmpdir=tempDir,
7981
sd=SD
8082
shell:"""
81-
mkdir -p t2t
82-
TEMP="$TMPDIR/$$_$RANDOM/"
83-
mkdir -p $TEMP
84-
#
85-
# Copy input file to temp dir that should have fast IO
86-
#
87-
{params.sd}/hardmask {input.mask} $TEMP/to_mask.{wildcards.index}.fasta && \
88-
pushd $TEMP && \
89-
RepeatMasker -nolow -libdir /home1/mchaisso/miniconda3/share/RepeatMasker/Libraries/Appended -species human -pa 8 -s -xsmall \"to_mask.{wildcards.index}.fasta\" && \
90-
popd && \
83+
if [ {params.repeatLibrary} != "na" ]
84+
then
85+
mkdir -p t2t
86+
TEMP="$TMPDIR/$$_$RANDOM/"
87+
mkdir -p $TEMP
88+
#
89+
# Copy input file to temp dir that should have fast IO
90+
#
91+
{params.sd}/hardmask {input.mask} $TEMP/to_mask.{wildcards.index}.fasta && \
92+
pushd $TEMP && \
93+
RepeatMasker -nolow -libdir /home1/mchaisso/miniconda3/share/RepeatMasker/Libraries/Appended -species human -pa 8 -s -xsmall \"to_mask.{wildcards.index}.fasta\" && \
94+
popd && \
9195
92-
if [ ! -e $TEMP/to_mask.\"{wildcards.index}\".fasta.masked ]; then
93-
ls -l masked/to_mask.\"{wildcards.index}\".fasta.masked
94-
cp masked/to_mask.\"{wildcards.index}\".fasta.masked t2t/to_mask.\"{wildcards.index}\".fasta.masked
95-
head -3 masked/to_mask.\"{wildcards.index}\".fasta.out > t2t/to_mask.\"{wildcards.index}\".fasta.out
96+
if [ ! -e $TEMP/to_mask.\"{wildcards.index}\".fasta.masked ]; then
97+
ls -l masked/to_mask.\"{wildcards.index}\".fasta.masked
98+
cp masked/to_mask.\"{wildcards.index}\".fasta.masked t2t/to_mask.\"{wildcards.index}\".fasta.masked
99+
head -3 masked/to_mask.\"{wildcards.index}\".fasta.out > t2t/to_mask.\"{wildcards.index}\".fasta.out
100+
else
101+
cp $TEMP/to_mask.\"{wildcards.index}\".fasta.* t2t/ || true
102+
fi
103+
rm -rf $TEMP
96104
else
97-
cp $TEMP/to_mask.\"{wildcards.index}\".fasta.* t2t/ || true
105+
echo "*** rule SpecialMaskContig made no change (skipped) ***" > t2t/to_mask.\"{wildcards.index}\".fasta.out
106+
cp {input.mask} t2t/to_mask.\"{wildcards.index}\".fasta.masked >> t2t/to_mask.\"{wildcards.index}\".fasta.out || true
98107
fi
99-
rm -rf $TEMP
100-
101108
"""
102109

103110
rule MergeMaskerRuns:

SegDupAnalysis.snakefile

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ pos=[]
3434
subs=["all", "high_ident"]
3535

3636

37-
localrules: all, AnnotateResolvedTandemDups, GetUniqueGencodeUnresolvedDupGenes, IntersectGenesWithFullSDList, FullDupToBed12, FullDupToLinks, MakeWMBed, MaskFile, ConvertHMMCopyNumberToCollapsedDuplications, SortSedef, FilterSedef, CountMaskedSedef, RemoveSedefTooMasked, MakeSedefGraph, MakeSedefGraphTable, FilterByGraphClusters, FullDupToBed12, FiltDupToBed12, GetUniqueGencodeUnresolvedDupGenesCN, GetUniqueGencodeUnresolvedDupGenes, GetGencodeMulticopy, GetGencodeMappedInDup, GetSupportedMulticopy,FindResolvedDuplicatedGenes, Bed12ToBed6, CombineGenesWithCollapsedDups, CombineDuplicatedGenes, MinimapGeneModelBed, FilterGencodeBed12, FindGenesInResolvedDups, SelectOneIsoform, SplitSplicedAndSingleExon, AnnotateLowCoverageFlanks, UnionMasked,GetNamedFasta, SelectDups, SortDups, GetDepthOverDups, FilterLowDepthDups, GetFullGeneCountTable, AddCollapsedGenes, GetCombinedTable, SelectDupsOneIsoform, GetFinalMerged, DupsPerContig, GetAllMultiGenes, AnnotateHighIdentity, GetTotalMasked, AnnotateResolvedTandemDups, GeneCountFact, GetFullGeneCountTable, FilterMultiExonBed, MappedSamIdentityDups, RemoveOriginal, RemoveBams, MakeSedefIntv, HighestIdentPairs, SelectHighIdent, GetCollapseByRange, GetCollapsedMask, GetCN
37+
localrules: all, AnnotateResolvedTandemDups, GetUniqueGencodeUnresolvedDupGenes, IntersectGenesWithFullSDList, FullDupToBed12, FullDupToLinks, MakeWMBed, MaskFile, ConvertHMMCopyNumberToCollapsedDuplications, SortSedef, FilterSedef, CountMaskedSedef, RemoveSedefTooMasked, MakeSedefGraph, MakeSedefGraphTable, FilterByGraphClusters, FullDupToBed12, FiltDupToBed12, GetUniqueGencodeUnresolvedDupGenesCN, GetUniqueGencodeUnresolvedDupGenes, GetGencodeMulticopy, GetGencodeMappedInDup, GetSupportedMulticopy,FindResolvedDuplicatedGenes, Bed12ToBed6, CombineGenesWithCollapsedDups, CombineDuplicatedGenes, MinimapGeneModelBed, FilterGencodeBed12, FindGenesInResolvedDups, SelectOneIsoform, SplitSplicedAndSingleExon, AnnotateLowCoverageFlanks, UnionMasked,GetNamedFasta, SelectDups, SortDups, GetDepthOverDups, FilterLowDepthDups, GetFullGeneCountTable, AddCollapsedGenes, GetCombinedTable, SelectDupsOneIsoform, GetFinalMerged, DupsPerContig, GetAllMultiGenes, AnnotateHighIdentity, GetTotalMasked, AnnotateResolvedTandemDups, GeneCountFact, GetGeneCountTableAbbreviatedNames, FilterMultiExonBed, MappedSamIdentityDups, RemoveOriginal, RemoveBams, MakeSedefIntv, HighestIdentPairs, SelectHighIdent, GetCollapseByRange, GetCollapsedMask, GetCN
3838

3939

4040

@@ -64,7 +64,8 @@ rule all:
6464
comb_with_depth="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth",
6565
fact="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth.filt.fact",
6666
gene_count="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth.filt.gene_count",
67-
gene_count_2column="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth.filt.gene_count_multi_single",
67+
gene_count_2column="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth.filt.gene_count_multi_single",
68+
gene_count_abbrvNames="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth.filt.gene_count.abbrv_names",
6869
asmMask=expand("{asm}.count_masked", asm=["assembly.orig.fasta", "assembly.masked.fasta", "assembly.repeat_masked.fasta", "assembly.union_masked.fasta"]),
6970
uniqueDupGenes="gencode.mapped.bam.bed12.dups.unique",
7071
sedef_high_uniq="sedef_out/all/final.sorted.bed.uniq.high",
@@ -1428,7 +1429,7 @@ rule MappedSamIdentity:
14281429
grid_opts=config["grid_small"],
14291430
sd=SD,
14301431
shell:"""
1431-
{params.sd}/hmcnc/HMM/samToBed {input.mappedsam} --reportAccuracy > {output.mappedsambed}
1432+
{params.sd}/hmcnc/src/samToBed {input.mappedsam} --reportAccuracy > {output.mappedsambed}
14321433
"""
14331434

14341435
rule AddDepthCopyNumber:
@@ -1466,7 +1467,8 @@ rule AnnotateOriginal:
14661467
output:
14671468
annot="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.sam.bed.dups.annot_orig",
14681469
params:
1469-
sd=SD
1470+
sd=SD,
1471+
grid_opts=config["grid_medium"] # KEON
14701472
shell:"""
14711473
{params.sd}/AnnotateOriginal.py {input.mappedsambeddups} > {output.annot}
14721474
"""
@@ -1688,6 +1690,19 @@ cat {input.depth_filt} | awk '{{ if (NR == 1) {{ print "gene\\tresolved\\tcollap
16881690
cat {output.gene_count_2column} | awk '{{ if (NR == 1) {{ print "gene\\tcopies";}} else {{ print $1"\\t"$2+$3;}} }}' > {output.gene_count}
16891691
"""
16901692

1693+
rule GetGeneCountTableAbbreviatedNames:
1694+
input:
1695+
gene_count="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth.filt.gene_count",
1696+
output:
1697+
gene_count_abbrvNames="gencode.mapped.bam.bed12.multi_exon.fasta.named.mm2.dups.one_isoform.txt.combined.depth.filt.gene_count.abbrv_names",
1698+
params:
1699+
sd=SD
1700+
shell:"""
1701+
awk 'BEGIN {{ FS = "[ |\t]"}} \
1702+
NR == 1 {{print}} \
1703+
NR > 1 {{print $1,"\t" $3}}' {input.gene_count} > {output.gene_count_abbrvNames}
1704+
"""
1705+
16911706
rule cramBam:
16921707
input:
16931708
bam=config['bam'],

0 commit comments

Comments
 (0)