#awk -f longread.awk aln.paf
#aln.paf: minimap2
#process long read alignment data for Lep-Anchor
BEGIN{
#mapping quality limit
if (qLimit == "")
qLimit=0
FS="\t"
rev["+"] = "-"
rev["-"] = "+"
}
(NF>=12 && $12>=qLimit){
for (i = 13; i<=NF;++i)
if ($i == "tp:A:S")
next #do not process secondary alignments
if ($1 == prevR) {
++n
} else {
processAllPairs()
n = 1
}
#print within contig part
print $6"\t"($8 + 1)"\t"$6"\t"$9"\t++\t"$12
print $6"\t"$9"\t"$6"\t"($8 + 1)"\t--\t"$12
o[n]=$5 #orientation
c[n]=$6 #contig
start[n]=$8+1 #start
end[n]=$9 #end
rstart[n]=$3+1 #read start
rend[n]=$4 #read end
q[n]=$12 #quality
prevR = $1
}
function min(a,b){
if (a <= b)
return a
else
return b
}
#find the shortest pair in read distance between each aligned region and contig
function processAllPairs( i,j,k,pos,count,rindex)
{
if (n <= 1)
return
if (n == 2) {
i = 1
j = 2
processOnePair(c[i],start[i],end[i],rstart[i],rend[i],o[i],c[j],start[j],end[j],rstart[j],rend[j],o[j],min(q[i], q[j]))
return
}
#sort reads based on rstart
for (i = 1; i <= n; ++i) {
pos[i] = rstart[i]
rindex[rstart[i], ++count[rstart[i]]] = i
}
asort(pos)
#process only adjacent reads (maybe could skip some?)
for (k = 1; k < n; ++k) {
i = rindex[pos[k], count[pos[k]]--]
j = rindex[pos[k + 1], count[pos[k + 1]]]
processOnePair(c[i],start[i],end[i],rstart[i],rend[i],o[i],c[j],start[j],end[j],rstart[j],rend[j],o[j],min(q[i], q[j]))
#printf pos[k]"->"pos[k+1] " " i " " j "\t"
}
#print ""
# for (i = 1; i <= n; ++i)
# for (j = i + 1; j <= n; ++j)
# processOnePair(c[j],start[j],end[j],rstart[j],rend[j],o[j],c[i],start[i],end[i],rstart[i],rend[i],o[i],min(q[i], q[j]))
}
function processOnePair(c1, start1, end1, rstart1, rend1, o1, c2, start2, end2, rstart2, rend2, o2, quality ,lp1,lp2,overlap,overlap1,overlap2)
{
# if (c1 == c2 && o1 != o2)
# return
if (rstart1 > rstart2) {
processOnePair(c2, start2, end2, rstart2, rend2, o2, c1, start1, end1, rstart1, rend1, o1, quality)
return;
}
#now rstart1 <= rstart2
# print "process"
overlap = 0
if (rend1 >= rstart2)
overlap = rend1 - rstart2 + 1
#overlap is too large
if (overlap > end1 - start1 + 1 && overlap > end2 - start2 + 1)
return
#remove thw overlap from one of the mappings with longer overhang...
if (end1 - start1 >= end2 - start2) {
overlap1 = overlap
overlap2 = 0
} else {
overlap1 = 0
overlap2 = overlap
}
if (o1 == "+")
lp1 = end1 - overlap1
else
lp1 = start1 + overlap1
if (o2 == "+")
lp2 = start2 + overlap2
else
lp2 = end2 - overlap2
#print both ways
print c1"\t"lp1"\t"c2"\t"lp2"\t"o1 o2"\t"quality
print c2"\t"lp2"\t"c1"\t"lp1"\t"rev[o2] rev[o1]"\t"quality
# print "end"
}
END{
processAllPairs()
}