o
    Rŀgf                     @   s   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ G dd	 d	ejZG d
d dejZedkrIddlmZ e  dS dS )a   Bio.Align support for alignment files in the Stockholm file format.

You are expected to use this module via the Bio.Align functions.

For example, consider this alignment from PFAM for the HAT helix motif::

    # STOCKHOLM 1.0
    #=GF ID   HAT
    #=GF AC   PF02184.18
    #=GF DE   HAT (Half-A-TPR) repeat
    #=GF AU   SMART;
    #=GF SE   Alignment kindly provided by SMART
    #=GF GA   21.00 21.00;
    #=GF TC   21.00 21.00;
    #=GF NC   20.90 20.90;
    #=GF BM   hmmbuild HMM.ann SEED.ann
    #=GF SM   hmmsearch -Z 57096847 -E 1000 --cpu 4 HMM pfamseq
    #=GF TP   Repeat
    #=GF CL   CL0020
    #=GF RN   [1]
    #=GF RM   9478129
    #=GF RT   The HAT helix, a repetitive motif implicated in RNA processing.
    #=GF RA   Preker PJ, Keller W;
    #=GF RL   Trends Biochem Sci 1998;23:15-16.
    #=GF DR   INTERPRO; IPR003107;
    #=GF DR   SMART; HAT;
    #=GF DR   SO; 0001068; polypeptide_repeat;
    #=GF CC   The HAT (Half A TPR) repeat is found in several RNA processing
    #=GF CC   proteins [1].
    #=GF SQ   3
    #=GS CRN_DROME/191-222     AC P17886.2
    #=GS CLF1_SCHPO/185-216    AC P87312.1
    #=GS CLF1_SCHPO/185-216    DR PDB; 3JB9 R; 185-216;
    #=GS O16376_CAEEL/201-233  AC O16376.2
    CRN_DROME/191-222                KEIDRAREIYERFVYVH.PDVKNWIKFARFEES
    CLF1_SCHPO/185-216               HENERARGIYERFVVVH.PEVTNWLRWARFEEE
    #=GR CLF1_SCHPO/185-216    SS    --HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH
    O16376_CAEEL/201-233             KEIDRARSVYQRFLHVHGINVQNWIKYAKFEER
    #=GC SS_cons                     --HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH
    #=GC seq_cons                    KEIDRARuIYERFVaVH.P-VpNWIKaARFEEc
    //

Parsing this file using Bio.Align stores the alignment, its annotations, as
well as the sequences and their annotations::

    >>> from Bio.Align import stockholm
    >>> alignments = stockholm.AlignmentIterator("Stockholm/example.sth")
    >>> alignment = next(alignments)
    >>> alignment.shape
    (3, 33)
    >>> alignment[0]
    'KEIDRAREIYERFVYVH-PDVKNWIKFARFEES'

Alignment meta-data are stored in alignment.annotations::

    >>> alignment.annotations["accession"]
    'PF02184.18'
    >>> alignment.annotations["references"][0]["title"]
    'The HAT helix, a repetitive motif implicated in RNA processing.'

Annotations of alignment columns are stored in alignment.column_annotations::

    >>> alignment.column_annotations["consensus secondary structure"]
    '--HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH'

Sequences and their annotations are stored in alignment.sequences::

   >>> alignment.sequences[0].id
   'CRN_DROME/191-222'
   >>> alignment.sequences[0].seq
   Seq('KEIDRAREIYERFVYVHPDVKNWIKFARFEES')
   >>> alignment.sequences[1].letter_annotations["secondary structure"]
   '--HHHHHHHHHHHHHHS--HHHHHHHHHHHHH'

Slicing specific columns of an alignment will slice any per-column-annotations:

    >>> alignment.column_annotations["consensus secondary structure"]
    '--HHHHHHHHHHHHHHS.--HHHHHHHHHHHHH'
    >>> part_alignment = alignment[:,10:20]
    >>> part_alignment.column_annotations["consensus secondary structure"]
    'HHHHHHS.--'
    N)defaultdict)	Alignment)
interfaces)Seq)	SeqRecordc                   @   s6  e Zd ZdZdZi ddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d%iZd&d'd(d)d*d+d,d-d.d/d0
Zd1d2d3d4d5d6d7d8Ze D ]\Z	Z
d9e
 ee	d: < q\d;D ]
Zed<d=ee< qkdd>d?d@dAZedBdC ZedDdE ZedFdG ZedHdI ZdJdK ZdLS )MAlignmentIteratora  Alignment iterator for alignment files in the Stockholm format.

    The file may contain multiple concatenated alignments, which are loaded
    and returned incrementally.

    Alignment meta-data (lines starting with #=GF) are stored in the dictionary
    alignment.annotations. Column annotations (lines starting with #=GC) are
    stored in the dictionary alignment.column_annotations. Sequence names are
    stored in record.id. Sequence record meta-data (lines starting with #=GS)
    are stored in the dictionary record.annotations. Sequence letter
    annotations (lines starting with #=GR) are stored in the dictionary
    record.letter_annotations.

    Wrap-around alignments are not supported - each sequence must be on
    a single line.

    For more information on the file format, please see:
    http://sonnhammer.sbc.su.se/Stockholm.html
    https://en.wikipedia.org/wiki/Stockholm_format
    	StockholmID
identifierAC	accessionDE
definitionAUauthorSEzsource of seedSSzsource of structureGAzgathering methodTCztrusted cutoffNCznoise cutoffBMzbuild methodSMzsearch methodTPtypePIzprevious identifierCCcommentCLclanWK	wikipediaCBzcalibration method**zsecondary structurezposterior probabilityzCatalytic Site Atlaszsurface accessibilitytransmembranezligand bindingzactive sitezactive site - Pfam predictedzactive site - from SwissProtintron)
r   PPCSASATMLIASpASsASINzreference coordinate annotationzconsensus sequencezconsensus scorezconsensus score 70zconsensus score 80zconsensus score 90z
model mask)RFseq_cons	scoreconsscorecons_70scorecons_80scorecons_90MMz
consensus _cons)RNA_elementsRNA_structural_elementRNA_structural_elementsRNA_ligand_AdoCblRNA_ligand_AqCblRNA_ligand_FMNRNA_ligand_GuanidiniumRNA_ligand_SAMRNA_ligand_THF_1RNA_ligand_THF_2RNA_ligand_TPPRNA_ligand_preQ1RNA_motif_k_turnRepeat_unit	2L3J_B_SSCOREPKPK_SScons_ organismzorganism classificationlook)r   OSOCLOc           	   	   C   s  |  D ]\}}|dkr;t|}g }|D ]#}d}|dr.||d d 7 }t|}|ds||7 }|| q|}n8|dv rEd|}n.|dkr^t|dksQJ t| |kr]t	d	q|d
krcnt|dksoJ ||f| }z
|| j
tj| < W q ty   Y qw d S )Nr    /)r   r   r"   rJ   SQ   z-Inconsistent number of sequences in alignmentr   )itemsiterendswithnextappendjoinlenintpop
ValueErrorannotationsr   
gf_mappingKeyError)		alignmentgfrowskeyvaluelines
referencesline	reference rk   G/var/www/html/myenv/lib/python3.10/site-packages/Bio/Align/stockholm.py_store_per_file_annotations   s<   

z-AlignmentIterator._store_per_file_annotationsc                    s   |r<i | _ | D ]4\}} rd fddt|D }t||kr1t| dt| d| || j tj||< q	d S d S )NrP   c                 3   s     | ]\}}| vr|V  qd S Nrk   .0indexletterskipped_columnsrk   rl   	<genexpr>   s    zBAlignmentIterator._store_per_column_annotations.<locals>.<genexpr>z length is z, expected )	column_annotationsrU   rZ   	enumerater[   r^   r   
gc_mappingget)rb   gccolumnsrt   re   rf   rk   rs   rl   _store_per_column_annotations   s"   z/AlignmentIterator._store_per_column_annotationsc                 C   s   |  D ];\}}| jD ]	}|j|kr nqtd| |  D ]\}}|dkr,||_q |dkr4||_q ||jtj	||< q qd S )NFailed to find seqname r   DR)
rU   	sequencesidr^   descriptiondbxrefsr_   r   
gs_mappingry   )rb   gsseqnamer_   recordre   rf   rk   rk   rl   _store_per_sequence_annotations  s   

z1AlignmentIterator._store_per_sequence_annotationsc                 C   s   |  D ]>\}}| jD ]	}|j|kr nqtd| |  D ]!\}}tj||}|dkr6|dd}n|dd}||j|< q qd S )Nr}   r&   -rP   .)	rU   r   r   r^   r   
gr_mappingry   replaceletter_annotations)rb   grr   r   r   re   letter_annotationfeaturerk   rk   rl   ._store_per_sequence_and_per_column_annotations  s   

z@AlignmentIterator._store_per_sequence_and_per_column_annotationsc           #   	   C   s  |D ]}|  }|sq|dkr/g }g }g }g }g }g }tt}	i }
tdd }tt}d }q|dkrtdd |D }t|tdkdd }t	||d	}d
d |D }t
|\}}t||D ]	\}}t||_qet
||}t|ddD ]}||= qz||_i |_|rg |jd< |D ])}t|}d|d |d< d|d |d< d|d |d< |jd | q|r||jd< |r||jd< |j\}}t||	| t||
|| t|| t|| |  S |dsz
|d d	\}}W n ty
   td| d w |d u rt|}td| }n|t|kr2td| dt| d| dt|D ]3\}}|dkrR|| tdksJJ td||< q6|dkrh|| tdksbJ td||< q6| dd}|| t!d |dd}|| q|d rX|d!d    d d	\}}|d"kr|d#sJ |"d$sJ t#|d	d% }tt}||d&< |rd||d'< g }|| q|d(kr|d) rJ ||d)< q|d*kr|d | q|d+kr|d | q|d,kr|d | q|d-kr|| q|d.krd/|i} ||  q|d0kr0d'| vs+J || d'< q|d1kr?d2|i}!||! q|d3krPd|!vsKJ ||!d< q|	| | q|d4r|d!d    d d5\}}||
vrud|
|< |
|  |  7  < q|d6rz|d!d    d d5\}}}W n ty   |d!d    d d	\}}d}Y nw |d.kr|| | | q||| vsJ ||| |< q|d d! d7kr|d!d  d d5}"|"d |ksJ |"d	 }|"d5   || |< qd S )8Nz# STOCKHOLM 1.0c                   S   s   dg iS )Nr~   rk   rk   rk   rk   rl   <lambda>9  s    z8AlignmentIterator._read_next_alignment.<locals>.<lambda>z//c                 S   s   g | ]}t | t jqS rk   )np
frombufferencodeint8rp   rowrk   rk   rl   
<listcomp>?  s    z:AlignmentIterator._read_next_alignment.<locals>.<listcomp>r   r   rT   c                 S   s   g | ]}|  qS rk   )tobytesr   rk   rk   rl   r   C  s    T)reverserh   rJ   titler   locationdatabase referencesnested domains#z>Could not split line into sequence name and aligned sequence:
   MzAligned sequence z consists of z letters, expected z	 letters)IDr   rP   )r   r   #=GF    RN[]rR   numberr   RMmedlineRTRARLRCr~   rj   DCNEr   NL#=GC    #=GS #=GR )$stripr   listdictr   arraynonzeroordalldeleter   parse_printed_alignmentzipr   seqsorted
operationsr_   rZ   rY   shaper   rm   r|   r   r   
startswithsplitr^   r[   	bytearrayrw   r   r   rW   r\   )#selfstreamri   recordsaligned_sequencesrh   reference_commentsdatabase_referencesnested_domainsrc   rz   r   r   lengthrt   r   coordinatessequencer   rb   rq   r   rj   rd   r{   r   aligned_sequenceirr   r   textr   database_referencenested_domaintermsrk   rk   rl   _read_next_alignment*  s$  























"
 z&AlignmentIterator._read_next_alignmentN)__name__
__module____qualname____doc__fmtr`   r   rx   rU   re   rf   keywordr   r   staticmethodrm   r|   r   r   r   rk   rk   rk   rl   r   f   s    	




r   c                   @   s   e Zd ZdZdd ej D Zdd ej D Zdd ej D Zdd ej	 D Z	dZ
dd	 Zed
d Zedd ZdS )AlignmentWriterz4Alignment file writer for the Stockholm file format.c                 C      i | ]\}}||qS rk   rk   rp   re   rf   rk   rk   rl   
<dictcomp>      zAlignmentWriter.<dictcomp>c                 C   r   rk   rk   r   rk   rk   rl   r     r   c                 C   r   rk   rk   r   rk   rk   rl   r     r   c                 C   r   rk   rk   r   rk   rk   rl   r     r   r   c                 C   s  |j \}}|dkrtd|dkrtdz|j}W n ty%   i }Y nw g }|d | j D ]9\}}|dkr;q2||}|durk| j| }|dv r_|D ]}	|d| d	|	 d
 qOq2|d| d	| d
 q2|d}
|
dur|
D ]&}|d}|dur|d| d
 |d}|dur|d| d
 qw|d}|dur|D ]I}|d}|t	d| |d|d  d |d|d  d
 |d }|t	d| |d|d  d
 |d|d  d
 q|d}|dur |D ] }|d|d  d
 |d}|dur|d| d
 qd}||}|dur<d | j|  }|t	|| |D ]$}|| jv rHq>|dkrOq>|dkrVq>|dkr]q>td!| |d"|  t
d#d$ |jD }t
|d%d& }|jD ]N}|j|}|j D ]\}}| j||}|d'| d(| d)| d
 q|jr|d'| d*|j d
 |jD ]}|d'| d+| d
 qq~z|j}W n ty   td,| }Y n
w t||ksJ t||jD ]\}}d-d.d$ t||D }|t|||| q|jr8|j D ]\}}| j||}d/| d)|| d
 }|| q|d0 d-|S )1z@Return a string with a single alignment in the Stockholm format.r   zMust have at least one sequencez Non-empty sequences are requiredz# STOCKHOLM 1.0
r   N)r   r    r   z   
r   r   z
#=GF NE   r   z
#=GF NL   rh   z
#=GF RC   z#=GF RN   [r   z]
z
#=GF RM   r   r   z
#=GF RT   z
#=GF RA   r   z
#=GF RL   r   z
#=GF DR   rj   z
#=GF DC   z
#=GF %s   z4Unknown annotation %s found in alignment.annotationsz#=GF SQ   %i
c                 s   s    | ]}t |jV  qd S rn   )r[   r   )rp   r   rk   rk   rl   ru   !  s    z3AlignmentWriter.format_alignment.<locals>.<genexpr>      r     rJ   z  DE z  DR r   rP   c                 s   s0    | ]\}}|d kr|t dkrdn|V  qdS )r   r   r   N)r   )rp   	operationrr   rk   rk   rl   ru   4  s
    
r   z//
)r   r^   r_   AttributeErrorrY   r`   rU   ry   r   _format_long_textmaxr   r   ljustr   r   r   r   bytesr[   r   rZ   extend_format_recordrv   rx   )r   rb   rd   r{   alignment_annotationsrg   re   r   rf   itemr   r   r   r   rh   rj   r   r   r   r   prefixwidthstartr   namer   r   ri   rk   rk   rl   format_alignment  s   



















"




z AlignmentWriter.format_alignmentc                 C   s$   |du rdS t j|dd| | dd S )z+Format the text as wrapped lines (PRIVATE).NrP   O   F)r   break_long_wordsinitial_indentsubsequent_indentr   )textwrapfill)r   r   rk   rk   rl   r   E  s   z!AlignmentWriter._format_long_textc                 c   s    |j |}|| d }|V  dd t|D }|  |j | }|j D ]E\}}tj||}	d}
t	dt
| }t|D ]\}}|dvrUt||
 ||< |
d7 }
qA| }d| d	|	 d
|| d }|V  q(dS )z.Format lines for a single SeqRecord (PRIVATE).r   c                 S   s   g | ]
\}}|d v r|qS ).-rk   ro   rk   rk   rl   r   [  s    z2AlignmentWriter._format_record.<locals>.<listcomp>r      .r   rT   r   r   rJ   N)r   r   rw   r   r   rU   r   r   ry   r   r[   r   decode)r   r   r   r   r   ri   indicesre   rf   r   jvaluesr   rr   rk   rk   rl   r   U  s,    zAlignmentWriter._format_recordN)r   r   r   r   r   r`   rU   r   r   rx   r   r   r   r   r   rk   rk   rk   rl   r     s    q
r   __main__)run_doctest)r   r   collectionsr   numpyr   	Bio.Alignr   r   Bio.Seqr   Bio.SeqRecordr   r   r   r   
Bio._utilsr  rk   rk   rk   rl   <module>   s"   S  f $
