o
    Rŀge                     @   s   d Z ddlZddlmZ zddlmZ W n ey   dZY nw ddlmZ ddl	m
Z
 ddlmZ dd	lmZ d
ZG dd deZdddZG dd dZdS )aS  Bio.AlignIO support for the "maf" multiple alignment format.

The Multiple Alignment Format, described by UCSC, stores a series of
multiple alignments in a single file. It is suitable for whole-genome
to whole-genome alignments, metadata such as source chromosome, start
position, size, and strand can be stored.

See http://genome.ucsc.edu/FAQ/FAQformat.html#format5

You are expected to use this module via the Bio.AlignIO functions(or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).

Coordinates in the MAF format are defined in terms of zero-based start
positions (like Python) and aligning region sizes.

A minimal aligned region of length one and starting at first position in the
source sequence would have ``start == 0`` and ``size == 1``.

As we can see on this example, ``start + size`` will give one more than the
zero-based end position. We can therefore manipulate ``start`` and
``start + size`` as python list slice boundaries.

For an inclusive end coordinate, we need to use ``end = start + size - 1``.
A 1-column wide alignment would have ``start == end``.
    N)islice)dbapi2)MultipleSeqAlignment)Seq)	SeqRecord   )SequentialAlignmentWriter   c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )		MafWriterz9Accepts a MultipleSeqAlignment object, writes a MAF file.c                 C   s   | j d | j d dS )zWrite the MAF header.z##maf version=1 scoring=none
z# generated by Biopython

N)handlewriteself r   E/var/www/html/myenv/lib/python3.10/site-packages/Bio/AlignIO/MafIO.pywrite_header8   s   zMafWriter.write_headerc                 C   s   |j ddkrd}n|j ddkrd}nd}dd|jdd	 d
|j dd d|j dtt|jdd |d
|j dd t|jg}| jd	| d dS )zHWrite a single SeqRecord object to an 's' line in a MAF block (PRIVATE).strandr   +-sz%-40s _z%15sstartr   z%5ssize srcSize
N)
annotationsgetidreplacelenstrseqr   r   join)r   recordr   fieldsr   r   r   _write_record=   s    zMafWriter._write_recordc                 C   s   t |ts	tdtdd |D dkrtdzddd |j D }W n ty2   d	}Y nw | j	
d
| d d}|D ]}| | |d7 }qA| j	
d |S )zWrite a complete alignment to a MAF block.

        Writes every SeqRecord in a MultipleSeqAlignment object to its own
        MAF block (beginning with an 'a' line, containing 's' lines).
        zExpected an alignment objectc                 S   s   h | ]}t |qS r   r"   ).0xr   r   r   	<setcomp>^   s    z,MafWriter.write_alignment.<locals>.<setcomp>r   z%Sequences must all be the same lengthr   c                 S   s&   g | ]\}}|d v r| d| qS ))scorepass=r   )r*   r+   yr   r   r   
<listcomp>h   s
    z-MafWriter.write_alignment.<locals>.<listcomp>z
score=0.00za r   r   )
isinstancer   	TypeErrorr"   
ValueErrorr%   _annotationsitemsAttributeErrorr   r   r(   )r   	alignmentannorecs_outr&   r   r   r   write_alignmentU   s(   


zMafWriter.write_alignmentN)__name__
__module____qualname____doc__r   r(   r;   r   r   r   r   r
   5   s
    r
   c              	   c   s    d}g }g }	 zt | }W n ty   d}Y nw |r|dr|  }t|dkr1td|d dkr:d	}n|d d
krCd}nd	}t|d t|d |t|d d}|d }	d|	v r|sftd|d j}
g }t	|	|
D ]\}}|
|dkr~|n| qrd|}	|
tt|	|d	 |d	 d|d nu|drno|drni|drnc|drn]| s|durt||ksJ t|}||_|V  d}g }g }n<td| |drd}|  d	d }t||dkrtdtdd |D }n|dr
n|sdS q) zIterate over a MAF file handle as MultipleSeqAlignment objects.

    Iterates over lines in a MAF file-like object (handle), yielding
    MultipleSeqAlignment objects. SeqRecord IDs generally correspond to
    species names.
    FTr   r      z5Error parsing alignment - 's' line must have 7 fields   r   r   r   r   r	         )r   r   r   r      .z/Found dot/period in first sequence of alignmentr   )r    namedescriptionr   ieq#Nz+Error parsing alignment - unexpected line:
ar/   z1Error parsing alignment - invalid key in 'a' linec                 s   s    | ]}| d V  qdS )r/   N)split)r*   a_stringr   r   r   	<genexpr>   s    zMafIterator.<locals>.<genexpr>)nextStopIteration
startswithstriprM   r"   r4   intr$   zipappendr%   r   r   r   r5   countdict)r   	seq_countin_a_bundler   recordsline
line_splitr   r9   sequencerefnewletter
ref_letterr8   annot_stringsr   r   r   MafIterator   s   






	


rd   c                   @   sz   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Ze	dd Z
e	dd Zdd Zdd ZdddZdd Zdd ZdS )MafIndexzIndex for a MAF file.

    The index is a sqlite3 database that is built upon creation of the object
    if necessary, and queried when methods *search* or *get_spliced* are
    used.
    c              
   C   s  t du rddlm} |d|| _|| _tjtj|| _	|| _
t| j
| _tj|rUt || _z|  | _W n@ tyT } z| j  | j  |dd}~ww t || _z|  | _W n ty{ } z| j  | j  |dd}~ww t| j| _dS )z)Indexes or loads the index of a MAF file.Nr   )MissingPythonDependencyErrorz.Python was compiled without the sqlite3 module)r   Biorf   _target_seqname_index_filenameospathabspathdirname_relative_path	_maf_fileopen_maf_fpisfileconnect_con_MafIndex__check_existing_db_record_countr4   close_MafIndex__make_new_indexrd   _mafiter)r   sqlite_filemaf_filetarget_seqnamerf   errr   r   r   __init__  s<   



zMafIndex.__init__c                 C   s   | j   d| _dS )a?  Close the file handle being used to read the data.

        Once called, further use of the index won't work. The sole
        purpose of this method is to allow explicit handle closure
        - for example if you wish to delete the file, on Windows
        you must first close all open handles to that file.
        r   N)rt   rw   rv   r   r   r   r   rw   .  s   

zMafIndex.closec           	   
   C   sl  zt | jd d }|tkr!dd| d| j g}t|| jd d }tj	
|r4|}ntj	| j|dtj	j}|tj	| jkrWtd| d	| j d
| jd d }|| jkrotd|| jf t | jd d }|dkrtdt | jd d }||krtd||f |W S  tjtjfy } ztd| dd}~ww )zEPerform basic sanity checks upon loading an existing index (PRIVATE).z1SELECT value FROM meta_data WHERE key = 'version'r   r   z=Index version (%s) incompatible with this version of MafIndexz;You might erase the existing index %s for it to be rebuilt.z2SELECT value FROM meta_data WHERE key = 'filename'/zIndex uses a different file (z != )z8SELECT value FROM meta_data WHERE key = 'target_seqname'z-Provided database indexed for %s, expected %sz6SELECT value FROM meta_data WHERE key = 'record_count'r   z$Unfinished/partial database providedz SELECT COUNT(*) FROM offset_dataz.Expected %s records, found %s.  Corrupt index?zProblem with SQLite database: N)rT   rt   executefetchoneMAFINDEX_VERSIONr%   ri   r4   rj   rk   isabsrn   r!   seprl   ro   rh   r   OperationalErrorDatabaseError)	r   idx_versionmsgfilenametmp_mafpath	db_targetrecord_countrecords_foundr}   r   r   r   __check_existing_db9  s   
zMafIndex.__check_existing_dbc                 C   s  | j d | j ddtf | j d | j dd| jf tj| js=tj| js=tj	| j| j
tjjd}n.tjtj| jtjj | j
tjj rdtj	| j| j
tjjd}ntj| j}| j dd|f | j d d	}|  }	 tt|d}|sn| j d| | j   |t|7 }q| j d | j d | j d | j d| d | j   |S )z2Read MAF file and generate SQLite index (PRIVATE).z.CREATE TABLE meta_data (key TEXT, value TEXT);z1INSERT INTO meta_data (key, value) VALUES (?, ?);versionz?INSERT INTO meta_data (key, value) VALUES ('record_count', -1);r|   r   r   zSCREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);r   Td   zCINSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);z9CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);z=CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);z9CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);zUPDATE meta_data SET value = 'z' WHERE key = 'record_count')rt   r   r   rh   rj   rk   r   ro   ri   relpathrn   r!   r   rm   rl   rR   _MafIndex__maf_indexerlistr   executemanycommitr"   )r   mafpathinsert_countmafindex_funcbatchr   r   r   __make_new_index  sn   


zMafIndex.__make_new_indexc                 c   s   | j  }|r|dr|| j  t| }	 | j  }| r%|dr-td| jf |dr{|  }|d | jkr{t	|d }t	|d }|t|d 
d	d
krgtd|t|d 
d	d
f || d }| ||d |||fV  nq| j  }|sdS dS )zReturn index information for each bundle (PRIVATE).

        Yields index information for each bundle in the form of
        (bin, start, end, offset) tuples where start and end are
        0-based inclusive coordinates.
        rL   Tz1Target for indexing (%s) not found in this bundler   r   r	   rB   rD   r   r   z=Invalid length for target coordinates (expected %s, found %s)N)rq   readlinerR   tellr"   rS   r4   rh   rM   rT   r!   _ucscbin)r   r\   offsetr]   r   r   endr   r   r   __maf_indexer  s<   




zMafIndex.__maf_indexerc                 C   s   ddg}| td| d?  d|d d?   | td| d?  d|d d?   | td| d	?  d
|d d	?   | td| d?  d|d d?   t|S )zFind bins that a region may belong to (PRIVATE).

        Converts a region to a list of bins that it may belong to, including largest
        and smallest bins.
        r   r      r	   	      
   I      J   I     iJ  )extendrangeset)r   r   binsr   r   r   _region2bin  s   $$$$zMafIndex._region2binc                 C   s^   g d}d}d}| }|d }||L }||L }|D ]}||kr$||   S ||L }||L }qdS )zReturn the smallest bin a given region will fit into (PRIVATE).

        Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
        )r   r   r   r   r   r   rB   r   r   r   )r   r   bin_offsets_bin_first_shift_bin_next_shift	start_binend_bin
bin_offsetr   r   r   r     s   
zMafIndex._ucscbinc                 C   s   | j | t| jS )zFRetrieve a single MAF record located at the offset provided (PRIVATE).)rq   seekrP   ry   )r   r   r   r   r   _get_record+  s   
zMafIndex._get_recordc              
   c   sp   t |t |krtdt||D ]\}}|| }|dk r'td|||f q| j}t }t||D ]\}}zdtt| ||}W n t	yU   t	d||f dw |
d|||d |d f }	|	 }
|
D ]I\}}}||f|v rwqk|||f | t|}|D ])}|j| jkr|jd }||jd	  d }||kr||kstd
|||||f q|V  qkq3dS )a  Search index database for MAF records overlapping ranges provided.

        Returns *MultipleSeqAlignment* results in order by start, then end, then
        internal offset field.

        *starts* should be a list of 0-based start coordinates of segments in the reference.
        *ends* should be the list of the corresponding segment ends
        (in the half-open UCSC convention:
        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).
        z2Every position in starts must have a match in endsr   z7Exon coordinates (%d, %d) invalid: exon length (%d) < 1z, z4Exon coordinates must be integers (start=%d, end=%d)NzSELECT DISTINCT start, end, offset FROM offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) ORDER BY start, end, offset ASC;r   r   z'Expected %s-%s @ offset %s, found %s-%s)r"   r4   rU   rt   r   r%   mapr#   r   r3   r   fetchalladdr   rT   r    rh   r   )r   startsends	exonstartexonendexonlenconyielded_rec_coordspossible_binsresultrows	rec_startrec_endr   fetchedr&   r   r   r   r   r   search0  sj   
zMafIndex.searchr   c               
   C   s  |dvrt d| t| ||}tdd t||D }t|dkr3tttd| | j	dgS dd	 |D }d
d |D }d}d}	|D ]}
|
D ]n}|j
| j	krz&|	du rf|jd }	|	dvret dn|	|jd krxt d|jd |	f W n ty   t d| j	 dw t|}|jd }|jd }|| d }||7 }|
D ]}t||d D ]	}d||j
 |< qq n
qKt d| j	 d|}t|D ],}|
D ]}|j
| j	kr|j| }||j
 |  |j| 7  < q|dkr||k r|d7 }qqGt|| j	 |krt d| j	t|| j	 |f dd || j	  D }i }|D ]Q}|| }g }|| j	kr0dnd}|j}t||D ].\}}t||D ]#}||v rR|||  qC||v ra||||   qC|| qCq:d|||< q t|| j	 dd|krt dt|| j	 dd| j	|f t|| j	 }| D ]\}}t||krt dt|||f qg }| D ]\}}t|}||	kr|n| }|t|||dd qt|S )a  Return a multiple alignment of the exact sequence range provided.

        Accepts two lists of start and end positions on target_seqname, representing
        exons to be spliced in silico.  Returns a *MultipleSeqAlignment* of the
        desired sequences spliced together.

        *starts* should be a list of 0-based start coordinates of segments in the reference.
        *ends* should be the list of the corresponding segment ends
        (in the half-open UCSC convention:
        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).

        To ask for the alignment portion corresponding to the first 100
        nucleotides of the reference sequence, you would use
        ``search([0], [100])``
        )r   r   zStrand must be 1 or -1, got c                 s   s    | ]	\}}|| V  qd S Nr   )r*   r   r   r   r   r   rO     s    z'MafIndex.get_spliced.<locals>.<genexpr>r   Nr    c                 S   s   h | ]
}|D ]}|j qqS r   r   )r*   multiseqr^   r   r   r   r,     s    z'MafIndex.get_spliced.<locals>.<setcomp>c                 S   s   i | ]}|i qS r   r   )r*   seq_namer   r   r   
<dictcomp>  s    z(MafIndex.get_spliced.<locals>.<dictcomp>Nr   zStrand must be 1 or -1z8Encountered strand='%s' on target seqname, expected '%s'z-No strand information for target seqname (%s)r   r   r   r   zDid not find z in alignment bundler   z/Target seqname (%s) has %s records, expected %sc                 S   s&   i | ]\}}t |d kr|t |qS r   r)   )r*   posgapped_fragmentr   r   r   r     s
    z9Returning %s letters for target seqname (%s), expected %sz'Returning length %s for %s, expected %s)r    rF   rG   )r4   r   r   sumrU   r"   r   r   r   rh   r    r   KeyErrorr   r$   r6   rV   r%   r!   reverse_complement) r   r   r   r   r   expected_lettersall_seqnamessplit_by_positiontotal_rec_lengthref_first_strandr   seqrec
rec_lengthr   ungapped_lengthr   r   real_pos
gapped_pos	track_valrealpos_to_lensubseqseqid	seq_split
seq_splicefiller_charrV   r   r   ref_subseq_lenr$   result_multiseqr   r   r   get_spliced  s   	


$




zMafIndex.get_splicedc                 C   s   d| j j| jf S )z,Return a string representation of the index.z%MafIO.MafIndex(%r, target_seqname=%r))rq   rF   rh   r   r   r   r   __repr__V  s   zMafIndex.__repr__c                 C   s   | j S )z*Return the number of records in the index.)rv   r   r   r   r   __len__]  s   zMafIndex.__len__Nr   )r<   r=   r>   r?   r~   rw   ru   rx   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   re      s"    )FR1


c Dre   r   )r?   rj   	itertoolsr   sqlite3r   ImportError	Bio.Alignr   Bio.Seqr   Bio.SeqRecordr   
Interfacesr   r   r
   rd   re   r   r   r   r   <module>   s    	
L|