o
    Rŀgo[                      @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	l
mZ dd
l
mZ ddlmZ ddlmZ ddlmZ ddlmZ edde	dddde	dddde	dddde	dddde	dddde	ddd de	dd!d"de	dd#d$de	dd%d&de	d'd(d)de	d*d+d,de	d*d-d.de	dd/d0de	dd1d2de	dd3d4de	dd5d6de	d*d7d8de	d9d:d;de	dd<d=de	dd>d?de	dd@dAde	ddBdCde	ddDdEde	ddFdGde	ddHdIdgZG dJdK dKejZG dLdM dMejZdS )Nah  Bio.Align support for alignment files in the bigPsl format.

A bigPsl file is a bigBed file with a BED12+13 format consisting of the 12
predefined BED fields and 13 custom fields defined in the autoSql file
bigPsl.as. This module uses the Bio.Align.bigbed module to parse the file,
but stores the data in a PSL-consistent manner as defined in bigPsl.as. As the
bigPsl format is a special case of the bigBed format, bigPsl files are binary
and are indexed as bigBed files.

See http://genome.ucsc.edu/goldenPath/help/bigPsl.html for more information.

You are expected to use this module via the Bio.Align functions.
    N)	Alignment)
Alignments)bigbed)AutoSQLTable)Field)reverse_complement)Seq)UndefinedSequenceError)Location)
SeqFeature)_insdc_location_string)	SeqRecordbigPslzbigPsl pairwise alignmentstringchromz)Reference sequence chromosome or scaffold)as_typenamecommentuint
chromStartzStart position in chromosomechromEndzEnd position in chromosomer   z:Name or ID of item, ideally both human readable and uniquescorezScore (0-1000)zchar[1]strandzO+ or - indicates whether the query aligns to the + or - strand on the reference
thickStartz4Start of where display should be thick (start codon)thickEndz1End of where display should be thick (stop codon)reservedz*RGB value (use R,G,B string in input file)int
blockCountzNumber of blockszint[blockCount]
blockSizesz#Comma separated list of block sizeschromStartsz&Start positions relative to chromStartoChromStartz"Start position in other chromosome	oChromEndz End position in other chromosomeoStrandzE+ or -, - means that psl was reversed into BED-compatible coordinates
oChromSizezSize of other chromosome.oChromStartszZStart positions relative to oChromStart or from oChromStart+oChromSize depending on strandlstring	oSequencez0Sequence on other chrom (or edit list, or empty)oCDSzCDS in NCBI format	chromSizezSize of target chromosomematchzNumber of bases matched.misMatchz Number of bases that don't matchrepMatchz2Number of bases that match but are part of repeatsnCountzNumber of 'N' basesseqTypez#0=empty, 1=nucleotide, 2=amino_acidc                       s<   e Zd ZdZdZ							d fdd		Zd
d Z  ZS )AlignmentWriterz1Alignment file writer for the bigPsl file format.r   NT FNc	           	         s4   t  j|dt|||d || _|| _|| _|| _dS )a  Create an AlignmentWriter object.

        Arguments:
         - target      - output stream or file name.
         - targets     - A list of SeqRecord objects with the chromosomes in the
                         order as they appear in the alignments. The sequence
                         contents in each SeqRecord may be undefined, but the
                         sequence length must be defined, as in this example:

                         SeqRecord(Seq(None, length=248956422), id="chr1")

                         If targets is None (the default value), the alignments
                         must have an attribute .targets providing the list of
                         SeqRecord objects.
         - compress    - If True (default), compress data using zlib.
                         If False, do not compress data.
         - extraIndex  - List of strings with the names of extra columns to be
                         indexed.
                         Default value is an empty list.
         - cds         - If True, look for a query feature of type CDS and write
                         it in NCBI style in the PSL file (default: False).
         - fa          - If True, include the query sequence in the PSL file
                         (default: False).
         - mask        - Specify if repeat regions in the target sequence are
                         masked and should be reported in the `repMatches` field
                         instead of in the `matches` field.
                         Acceptable values are
                         None   : no masking (default);
                         "lower": masking by lower-case characters;
                         "upper": masking by upper-case characters.
         - wildcard    - Report alignments to the wildcard character in the
                         target or query sequence in the `nCount` field instead
                         of in the `matches`, `misMatches`, or `repMatches`
                         fields.
                         Default value is 'N'.
           )bedNdeclarationtargetscompress
extraIndexN)super__init__r3   cdsfamaskwildcard)	selftargetr4   r5   r6   r9   r:   r;   r<   	__class__r/   D/var/www/html/myenv/lib/python3.10/site-packages/Bio/Align/bigpsl.pyr8      s   /
zAlignmentWriter.__init__c           (   
   C   s  t  }| j}| j}|D ]@}t|tstd|j}|jsq|j\}}	z|	j	}	W n	 t
y1   Y nw z|j	}W n	 t
y@   Y nw t|}
t|	}d}|d |d krnd}t|	}	| }||dddf  |dddf< n'|d |d krd}t|}| }|
|d	ddf  |d	ddf< d
}nd}| j}| j}d	}d	}d	}d	}g }g }g }|ddd	f \}}|ddddf  D ]X\}}||kr|}q||kr|}q|| }|| }|| || || ||kr|d
usJ d}n|d| ksJ |dusJ d
}||| }|	|| }zt|}W n ty(   t|d}Y n ty3   d}Y nw zt|}W n tyI   t|d}Y n tyT   d}Y nw |du s_|du rd||7 }n|dkrt| | |D ]0\}} }!||ks| |kr|d7 }qs|| kr||!kr|d7 }qs|d7 }qs|d7 }qsno|dkrt| | |D ]0\}} }!||ks| |kr|d7 }q|| kr||!kr|d7 }q|d7 }q|d7 }qn.t| | D ]$\}} ||ks| |kr|d7 }q|| kr|d7 }q|d7 }q|}|}qt|}t|}t|}z|j}W n
 t
y7   Y nw z|j}W n
 t
yG   Y nw z|j}W n
 t
yW   Y nw z|j}W n
 t
yg   Y nw |d	 }|d | }d}"|dkr|d
u rd}"|||  }|ddd }|jdddddf |_n	|| || }}|d
u rt|jj	}#nd}#|d
u r|jjD ]}$|$j dkrt!|$j"t|j}% nqd}%nd}%d	}&|jj#$d}'|'dkrd}&n
|'dkrd}&nd}&t||j#d< t||j#d< |"|j#d< t||j#d< d%t&t||j#d < |#|j#d!< |%|j#d"< t|
|j#d#< t||j#d$< t||j#d%< t||j#d&< t||j#d'< |&|j#d(< || q|j'd)d* d+ |j(|_(t)j*|d,t+| j,d--| dS ).zWrite the file.zExpected an Alignment objectN   r   rC   -rC   r   r   r   rE   r   T+F   ASCIIlowerupperrE    CDSn/amolecule_typeDNA1protein20r    r!   r"   r#   ,r$   r&   r'   r(   r)   r*   r+   r,   r-   c                 S   s   | j j| jd fS )NrG   )r>   idcoordinates)	alignmentr/   r/   rA   <lambda>  s    z,AlignmentWriter.write_file.<locals>.<lambda>)keyr1   )r2   r3   r5   ).r   r9   r:   
isinstancer   	TypeErrorrY   size	sequencesseqAttributeErrorlenr   copyr<   r;   	transposeappendbytesr	   ziprM   rL   nparraymatches
misMatches
repMatchesr,   strqueryfeaturestyper   locationannotationsgetjoinmapsortr4   r   r.   r3   r5   write)(r=   stream
alignmentsfixed_alignmentsr9   r:   rZ   rY   r>   ro   tSizeqSizednaxr   r<   r;   rk   rl   rm   r,   r   qStartstStartstStartqStarttEndqEndtCountqCounttSeqqSequ1u2c1r"   r&   featurer'   r-   rQ   r/   r/   rA   
write_file   sn  




" "





























zAlignmentWriter.write_file)NTr/   FFNr0   )__name__
__module____qualname____doc__fmtr8   r   __classcell__r/   r/   r?   rA   r.      s    <r.   c                   @   s$   e Zd ZdZdZdd Zdd ZdS )AlignmentIteratorzAlignment iterator for bigPsl files.

    The pairwise alignments stored in the bigPsl file are loaded and returned
    incrementally.  Additional alignment information is stored as attributes
    of each alignment.
    r   c                 C   s>   d}t |D ]\}}||| jkrtd||| jf qd S )N)r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r&   r'   r(   r)   r*   r+   r,   r-   z$Expected field name '%s'; found '%s')	enumerater   
ValueError)r=   fields
fieldCountdefinedFieldCountnamesir   r/   r/   rA   _analyze_fields  s   z!AlignmentIterator._analyze_fieldsc           &      C   s>  ||d  dks
J |||d    d}t|dkr%tdt| | j| }t|d }	t||	kr@td|	t|f |d }
|d }t|d	 }t|d
 }dd |d ddD }dd |d ddD }dd |d ddD }t||krtdt||f t||krtdt||f t||krtdt||f t|}t|}t|}|d }|dkrt	d |d}nt	|}t||krtdt||f t
||d}|d }|r|dkrt|}t|dd}|j| |d }|d kr|}n"|d!krd"|jd#< |}n|d$kr.d%|jd#< |d& }ntd'| ||7 }|d( }|d)krs|
d)krs||}}|| | }|	| | }|d d d* }|d d d* }|d d d* }|d d d* }|d }|d }||gg}t||||D ]2\}}}} ||kr|||g |}| |kr||| g | }||7 }||7 }|||g qt| }t|d+ }!t|d, }"|
d)kr|d)kr|	|dd d f  |dd d f< n|"|!}!}"||dd d f  |dd d f< ||d- krtd.||d- f ||d/ krtd0||d/ f |!|d1 kr0td2|!|d1 f |"|d3 krAtd4|"|d3 f ||g}#t|#|}$i |$_|d }%zt|%}%W n
 tya   Y n
w |% rkt|%}%|%|$_t|d& |$_t|d5 |$_|d6 |$_t|d7 |$_t|d8 |$_t|d9 |$_t|d: |$_|$S );NrC   r   	   z-Unexpected number of fields (%d, expected 22)   z+Unexpected chromosome size %d (expected %d)   r1      c                 S      g | ]}t |qS r/   r   ).0	blockSizer/   r/   rA   
<listcomp>      z7AlignmentIterator._create_alignment.<locals>.<listcomp>   rW   c                 S   r   r/   r   r   startr/   r/   rA   r     r      c                 S   r   r/   r   r   r/   r/   rA   r     r      z5Inconsistent number of blocks (%d found, expected %d)zDInconsistent number of query start positions (%d found, expected %d)zEInconsistent number of target start positions (%d found, expected %d)   rN   )lengthz4Inconsistent query sequence length (%d, expected %d))rX      rP   rO   )rq      rV   rS   rR   rQ   rU   rT   rJ   zUnexpected sequence type '%s'   rF   rE   	   
   rG   z+Inconsistent tStart found (%d, expected %d)rH   z)Inconsistent tEnd found (%d, expected %d)rB   z+Inconsistent qStart found (%d, expected %d)rD   z)Inconsistent qEnd found (%d, expected %d)                  )decodesplitrc   r   r4   r   rstripri   rj   r   r   r
   
fromstringr   rp   rf   rs   rh   re   r   float
is_integerr   r   r   itemRgbrk   rl   rm   r,   )&r=   chromIdr   r   rest	dataStartdataEndwordstarget_recordr|   r   qNamer}   r   r   r   r   tBlockSizesquery_sequencequery_recordr9   rr   r   r-   qBlockSizesqStrand	qPosition	tPositionrY   tBqBtSqSr   r   recordsrZ   r   r/   r/   rA   _create_alignment  s  






















"
 




z#AlignmentIterator._create_alignmentN)r   r   r   r   r   r   r   r/   r/   r/   rA   r     s
    "r   )r   numpyri   	Bio.Alignr   r   r   Bio.Align.bigbedr   r   Bio.Seqr   r   r	   Bio.SeqFeaturer
   r   Bio.SeqIO.InsdcIOr   Bio.SeqRecordr   r3   r.   r   r/   r/   r/   rA   <module>   s*     