o
    Rŀgwk                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ zddlZW n e	y0   ddl
mZ eddw ddl
mZ dd	lmZ dd
lmZ dddZdddZdddZG dd deZG dd dZdd ZedkrvddlmZ edd dS dS )a>  Tools for sequence motif analysis.

Bio.motifs contains the core Motif class containing various I/O methods
as well as methods for motif comparisons and motif searching in sequences.
It also includes functionality for parsing output from the AlignACE, MEME,
and MAST programs, as well as files in the TRANSFAC format.
    N)	urlencode)Request)urlopen)MissingPythonDependencyErrorz,Install NumPy if you want to use Bio.motifs.)BiopythonDeprecationWarning)	Alignment)reverse_complementACGTc                 C   s   t | }t||dS )zCreate a Motif object.)	alignmentalphabet)r   Motif)	instancesr   r
    r   G/var/www/html/myenv/lib/python3.10/site-packages/Bio/motifs/__init__.pycreate%   s   r   Tc                 C   s(  |  }|dkrddlm} || S |dkr"ddlm} || S |dkr1ddlm} || S |dkr@dd	lm} || S |d
v rPddlm} || |S |dkr_ddlm} || S |dkrnddlm	}	 |	| S |dkr~ddlm
}
 |
| |S |dv rddlm} || |S td| )a  Parse an output file from a motif finding program.

    Currently supported formats (case is ignored):
     - AlignAce:         AlignAce output file format
     - ClusterBuster:    Cluster Buster position frequency matrix format
     - XMS:              XMS matrix format
     - MEME:             MEME output file motif
     - MINIMAL:          MINIMAL MEME output file motif
     - MAST:             MAST output file motif
     - TRANSFAC:         TRANSFAC database file format
     - pfm-four-columns: Generic position-frequency matrix format with four columns. (cisbp, homer, hocomoco, neph, tiffin)
     - pfm-four-rows:    Generic position-frequency matrix format with four row. (scertf, yetfasco, hdpi, idmmpmm, flyfactor survey)
     - pfm:              JASPAR-style position-frequency matrix
     - jaspar:           JASPAR-style multiple PFM format
     - sites:            JASPAR-style sites file

    As files in the pfm and sites formats contain only a single motif,
    it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
    for those.

    For example:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCACCTAGCTACGAGTGAG
    GTGCCCTAAGCATACTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAAGTGCCGGAG
    GCACGTCCCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GAGATCAGAGGGCCG
    TGGACGCGGGG
    GACCAGAGCCTCGCATGGGGG
    AGCGCGCGTG
    GCCGGTTGCTGTTCATTAGG
    ACCGACGGCAGCTAAAAGGG
    GACGCCGGGGAT
    CGACTCGCGCTTACAAGG

    If strict is True (default), the parser will raise a ValueError if the
    file contents does not strictly comply with the specified file format.
    alignacer   )r   meme)r   minimal)r   clusterbusterr   )zpfm-four-columnszpfm-four-rows)pfmxms)r   mast)r   transfacr   )r   sitesjasparr   zUnknown format %s)lower
Bio.motifsr   readr   r   r   r   r   r   r   r   
ValueError)handlefmtstrictr   r   r   r   r   r   r   r   r   r   r   r   parse+   s:   0





r%   c                 C   sH   |  }t| ||}t|dkrtdt|dkrtd|d }|S )a;  Read a motif from a handle using the specified file-format.

    This supports the same formats as Bio.motifs.parse(), but
    only for files containing exactly one motif.  For example,
    reading a JASPAR-style pfm file:

    >>> from Bio import motifs
    >>> with open("motifs/SRF.pfm") as handle:
    ...     m = motifs.read(handle, "pfm")
    >>> m.consensus
    Seq('GCCCATATATGG')

    Or a single-motif MEME file,

    >>> from Bio import motifs
    >>> with open("motifs/meme.psp_test.classic.zoops.xml") as handle:
    ...     m = motifs.read(handle, "meme")
    >>> m.consensus
    Seq('GCTTATGTAA')

    If the handle contains no records, or more than one record,
    an exception is raised:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     motif = motifs.read(handle, "AlignAce")
    Traceback (most recent call last):
        ...
    ValueError: More than one motif found in handle

    If however you want the first motif from a file containing
    multiple motifs this function would raise an exception (as
    shown in the example above).  Instead use:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     record = motifs.parse(handle, "alignace")
    >>> motif = record[0]
    >>> motif.consensus
    Seq('TCTACGATTGAG')

    Use the Bio.motifs.parse(handle, fmt) function if you want
    to read multiple records from the handle.

    If strict is True (default), the parser will raise a ValueError if the
    file contents does not strictly comply with the specified file format.
    r   zNo motifs found in handle   z#More than one motif found in handle)r   r%   lenr!   )r"   r#   r$   motifsmotifr   r   r   r       s   0r    c                   @   s:   e Zd ZdZdddZdd Zdd	 Zd
d Zdd ZdS )	Instancesz:Class containing a list of sequences that made the motifs.Nr	   c           	      C   s   ddl m} ddl m} tdt t|||tfrtdd}|dur[g }|D ]-}|du r3t	|}n|t	|krEdt	||f }t
|t||sP|t|}|| q(| | || _|| _dS )Initialize the class.r   
MutableSeqSeqa  The Instances class has been deprecated; please use the
Alignment class in Bio.Align instead.
To create a Motif instance, instead of
>>> from Bio.motifs import Instances
>>> instances = Instances([Seq('ACGT'), Seq('ACCT'), Seq('AAGT')])
>>> motif = Motif(alphabet='ACGT', instances=instances)

please use

>>> from Bio.Align import Alignment
>>> alignment = Alignment([Seq('ACGT'), Seq('ACCT'), Seq('AAGT')])
>>> motif = Motif(alphabet='ACGT', alignment=alignment)
zinstances should be iterator of Seq objects or strings. If a single sequence is given, will treat each character as a separate sequence.NzAAll instances should have the same length (%d found, %d expected))Bio.Seqr-   r/   warningswarnr   
isinstancestr	TypeErrorr'   r!   appendextendlengthr   )	selfr   r   r-   r/   r8   	sequencesinstancemessager   r   r   __init__   s8   




zInstances.__init__c                 C   s"   d}| D ]
}|t |d 7 }q|S )z6Return a string containing the sequences of the motif. 
r4   )r9   textr;   r   r   r   __str__   s   zInstances.__str__c                 C   sT   i }| j D ]
}dg| j ||< q| D ]}t|D ]\}}|| |  d7  < qq|S )z Count nucleotides in a position.r   r&   )r   r8   	enumerate)r9   countsletterr;   positionr   r   r   count   s   
zInstances.countc                 c   s\    t dt tt|| j d D ]}| D ]}||||| j  kr*||fV   nqqdS )zFind positions of motifs in a given sequence.

        This is a generator function, returning found positions of motif
        instances in a given sequence.
        z~instances.search(sequence) has been deprecated. Please use sequence.search(instances) instead, where sequence is a Seq object.r&   N)r1   r2   r   ranger'   r8   )r9   sequenceposr;   r   r   r   search  s   
zInstances.searchc                 C   s   ddl m} ddl m} ddlm} t| jd}| j|_| D ]&}t||||fr-|	 }nt|t
r7t	|}ntdt| || q|S )z(Compute reverse complement of sequences.r   r,   r.   )	SeqRecord)r   zinstance has unexpected type %s)r0   r-   r/   Bio.SeqRecordrL   r*   r   r8   r3   r   r4   RuntimeErrortyper6   )r9   r-   r/   rL   r   r;   r   r   r   r     s   


zInstances.reverse_complement)Nr	   )	__name__
__module____qualname____doc__r=   rB   rG   rK   r   r   r   r   r   r*      s    
0
r*   c                   @   s  e Zd ZdZd0ddZdd Zdd	 ZeeeZ[[d
d Z	dd Z
ee	e
Z[	[
dd Zdd ZeeeZ[[dd Zedd Zedd Zedd Zd1ddZdd Zdd  Zed!d" Zed#d$ Zed%d& Zed'd( Zd2d*d+Zd,d- Zd.d/ ZdS )3r   z%A class representing sequence motifs.r	   Nc           	      C   s4  ddl m} d| _|dur|durttdt|tr|}d}|dur9tdt	 |dur2ttdt
|}|j}|durF|durFttd|durZd| _|||| _| jj| _n2|dur|j}|j}|D ]}||vrtt|t||< qf|||| _|| _|| _n	d| _d| _d| _|| _d| _d| _d| _dS )	r+   r&   )matrixr>   Nz9Specify either alignment or instances, don't specify bothaS  The instances argument has been deprecated.
Instead of
>>> instances = [Seq('ACGT'), Seq('ACCT'), Seq('AAGT')]
>>> motif = Motif(alphabet='ACGT', instances=instances)

please use

>>> from Bio.Align import Alignment
>>> alignment = Alignment([Seq('ACGT'), Seq('ACCT'), Seq('AAGT')])
>>> motif = Motif(alphabet='ACGT', alignment=alignment)
z6Specify either counts or instances, don't specify bothz9Specify either counts or an alignment, don't specify both)r>   rT   name	Exceptionr!   r3   r*   r1   r2   r   r   r   r
   FrequencyPositionMatrixrD   r8   frequenciesnpzerosintpseudocounts
backgroundmask)	r9   r   r
   rD   r   rT   r8   rX   rE   r   r   r   r=   (  sZ   

zMotif.__init__c                 C      | j S N)_Motif__maskr9   r   r   r   
__get_maskc     zMotif.__get_maskc                 C   s   | j d u r
d| _d S |d u rd| j  | _d S t|| j kr(tdt|| j f t|trXg | _|D ]}|dkr?| jd q2|dkrJ| jd q2td| t| j| _d S td	d
 |D | _d S )Nr   )r&   zMThe length (%d) of the mask is inconsistent with the length (%d) of the motif*r&    r   z2Mask should contain only '*' or ' ' and not a '%s'c                 s   s    | ]	}t t|V  qd S r`   )r[   bool.0cr   r   r   	<genexpr>}  s    z#Motif.__set_mask.<locals>.<genexpr>)r8   ra   r'   r!   r3   r4   r6   tuple)r9   r^   charr   r   r   
__set_maskf  s,   


zMotif.__set_maskc                 C   r_   r`   )_pseudocountsrb   r   r   r   __get_pseudocounts  rd   zMotif.__get_pseudocountsc                    sJ   i | _ t tr fdd| jD | _ d S  d u rd t| j | _ d S )Nc                       i | ]}| | qS r   r   ri   rE   valuer   r   
<dictcomp>      z,Motif.__set_pseudocounts.<locals>.<dictcomp>g        )ro   r3   dictr   fromkeys)r9   rt   r   rs   r   __set_pseudocounts  s   
zMotif.__set_pseudocountsc                 C   r_   r`   )_backgroundrb   r   r   r   __get_background  rd   zMotif.__get_backgroundc                    s   t  tr fdd| jD | _n: d u rt| jd| _n-t| jg dkr+tdd  d | jd<  d | jd<  d | jd	< d  d | jd
< t| j }| jD ]}| j|  |  < qUd S )Nc                    rq   r   r   rr   rs   r   r   ru     rv   z*Motif.__set_background.<locals>.<dictcomp>g      ?ACGTz}Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content)g       @r}   r~   r   r   )	r3   rw   r   rz   rx   sortedr!   sumvalues)r9   rt   totalrE   r   rs   r   __set_background  s    

zMotif.__set_backgroundc                    s   t  ts	tdj}jdu r&d}jdu rd}n fdd|D }njdd f }d}t|||d}j  |_|du r_|du r_zj}W n	 t	yT   Y nw t
t | |_j |_j |_|S )aB  Return a new Motif object for the positions included in key.

        >>> from Bio import motifs
        >>> motif = motifs.create(["AACGCCA", "ACCGCCC", "AACTCCG"])
        >>> print(motif)
        AACGCCA
        ACCGCCC
        AACTCCG
        >>> print(motif[:-1])
        AACGCC
        ACCGCC
        AACTCC
        zmotif indices must be slicesNc                    s   i | ]
}|j |   qS r   )rD   rr   keyr9   r   r   ru     s    z%Motif.__getitem__.<locals>.<dictcomp>)r   r
   rD   )r3   slicer5   r   r
   rD   r   r^   r8   AttributeErrorr'   rH   indicesr\   copyr]   )r9   r   r   r
   rD   r)   r8   r   r   r   __getitem__  s,   



zMotif.__getitem__c                 C      | j | jS )z?Calculate and return the position weight matrix for this motif.)rD   	normalizero   rb   r   r   r   pwm     z	Motif.pwmc                 C   r   )zICalculate and return the position specific scoring matrix for this motif.)r   log_oddsrz   rb   r   r   r   pssm  r   z
Motif.pssmc                 C   s"   t dt | jdu rdS | jjS )z4Return the sequences from which the motif was built.zrThe instances attribute has been deprecated. Instead of mymotif.instances, please use mymotif.alignment.sequences.N)r1   r2   r   r
   r:   rb   r   r   r   r     s   
zMotif.instancesFc                 C   sZ   d}| j dur|d| j 7 }|r+t| jD ]}| j| r"|d7 }q|d7 }q|d7 }|S )z(Return string representation of a motif.r>   Nr?   re   rf   )r
   joinrH   r8   ra   )r9   maskedrA   ir   r   r   rB     s   



zMotif.__str__c                 C   s   | j du rdS | j S )zReturn the length of a motif.

        Please use this method (i.e. invoke len(m)) instead of referring to m.length directly.
        Nr   )r8   rb   r   r   r   __len__  s   
zMotif.__len__c                 C   s   | j }| jdur| j }t||d}n-| jd ddd | jd ddd | jd ddd | jd ddd d}t||d	}| jddd |_| jd | jd | jd | jd d|_| jd | jd | jd | jd d|_|S )
z:Return the reverse complement of the motif as a new motif.N)r   r
   r   r   r~   r}   r|   )r   rD   )r   r
   r   r   rD   ra   r]   r\   )r9   r   r
   resrD   r   r   r   r     s,   

zMotif.reverse_complementc                 C      | j jS )zReturn the consensus sequence.)rD   	consensusrb   r   r   r   r        zMotif.consensusc                 C   r   )zBReturn the least probable pattern to be generated from this motif.)rD   anticonsensusrb   r   r   r   r   $  r   zMotif.anticonsensusc                 C   r   )aK  Return the degenerate consensus sequence.

        Following the rules adapted from
        D. R. Cavener: "Comparison of the consensus sequence flanking
        translational start sites in Drosophila and vertebrates."
        Nucleic Acids Research 15(4): 1353-1361. (1987).

        The same rules are used by TRANSFAC.
        )rD   degenerate_consensusrb   r   r   r   r   )  s   zMotif.degenerate_consensusc              
      sH  | j }| j| j | j| j}t|}| jdu rZt fddt	|D }
 D ]*\}}t||  }|dk}|| ||  }||  |t|||   7  < q-|S t|}
 D ]\}}|t||  7 }qc
 D ]*\}}t||  }|dk}|| ||  }||  |t|||   7  < qw|S )zGReturn an array with the relative entropy for each column of the motif.Nc                    s&   g | ] t  fd dD qS )c                 3   s$    | ]} |  |  V  qd S r`   r   rh   )rD   r   r\   r   r   rk   B  s   " z4Motif.relative_entropy.<locals>.<listcomp>.<genexpr>)r   )ri   r   rD   r\   )r   r   
<listcomp>A  s    z*Motif.relative_entropy.<locals>.<listcomp>r   )r]   r\   r   rD   r8   rY   rZ   r
   arrayrH   itemslog2)r9   r]   r8   r   r   rE   rX   r^   r   r   r   relative_entropy6  s6   

$

$zMotif.relative_entropyPNGc                 K   s  |dur
t dt t| jtdkrd}nt| jtdkr"d}nt| jtdkr.d}nd	}t| d
}d}i d|d| ddddd|dddddddddt| jddddddd dd!dd"dd#di d$dd%dd&dd'd	d(d)d*dd+dd,d-d.dd/dd0dd1dd2dd3dd4dd5dd6dd7di}|	d8d9 |
 D  t|d:}	t||	}
t|
}t|d;}| }|| W d   dS 1 sw   Y  dS )<a+  Download and save a weblogo using the Berkeley weblogo service.

        Requires an internet connection.

        The version parameter is deprecated and has no effect.

        The parameters from ``**kwds`` are passed directly to the weblogo server.

        Currently, this method uses WebLogo version 3.3.
        These are the arguments and their default values passed to
        WebLogo 3.3; see their website at http://weblogo.threeplusone.com
        for more information::

            'stack_width' : 'medium',
            'stacks_per_line' : '40',
            'alphabet' : 'alphabet_dna',
            'ignore_lower_case' : True,
            'unit_name' : "bits",
            'first_index' : '1',
            'logo_start' : '1',
            'logo_end': str(self.length),
            'composition' : "comp_auto",
            'percentCG' : '',
            'scale_width' : True,
            'show_errorbars' : True,
            'logo_title' : '',
            'logo_label' : '',
            'show_xaxis': True,
            'xaxis_label': '',
            'show_yaxis': True,
            'yaxis_label': '',
            'yaxis_scale': 'auto',
            'yaxis_tic_interval' : '1.0',
            'show_ends' : True,
            'show_fineprint' : True,
            'color_scheme': 'color_auto',
            'symbols0': '',
            'symbols1': '',
            'symbols2': '',
            'symbols3': '',
            'symbols4': '',
            'color0': '',
            'color1': '',
            'color2': '',
            'color3': '',
            'color4': '',

        Nz6The version parameter is deprecated and has no effect.ACDEFGHIKLMNPQRSTVWYalphabet_proteinACGUalphabet_rnar	   alphabet_dnaautor   z+https://weblogo.threeplusone.com/create.cgir:   formatstack_widthmediumstacks_per_line40r   ignore_lower_caseT	unit_namebitsfirst_index1
logo_startlogo_endcomposition	comp_auto	percentCGr>   scale_widthshow_errorbars
logo_title
logo_label
show_xaxisxaxis_label
show_yaxisyaxis_labelyaxis_scaleyaxis_tic_intervalz1.0	show_endsshow_fineprintcolor_scheme
color_autosymbols0symbols1symbols2symbols3symbols4color0color1color2color3color4c                 S   s&   i | ]\}}||d u rdnt |qS )Fr>   r@   )ri   kvr   r   r   ru     s   & z!Motif.weblogo.<locals>.<dictcomp>zutf-8wb)r1   r2   r   setr   r   r   r4   r8   updater   r   encoder   r   openr    write)r9   fnamer#   versionkwdsalpharX   urlr   datareqresponsefimr   r   r   weblogoV  s   1
	

 !"#&
"zMotif.weblogoc                 C   s   |dv rddl m} | g}|||S |dkr%ddl m} | g}||S |dkr7ddl m} | g}||S |s=t| S td| )	[  Return a string representation of the Motif in the given format.

        Currently supported formats:
         - clusterbuster: Cluster Buster position frequency matrix format
         - pfm : JASPAR single Position Frequency Matrix
         - jaspar : JASPAR multiple Position Frequency Matrix
         - transfac : TRANSFAC like files

        r   r   r   r   r   r   r   r   Unknown format type %s)r   r   r   r   r   r4   r!   )r9   format_specr   r(   r   r   r   r   r   
__format__  s   


zMotif.__format__c                 C   s
   |  |S )r   )r   )r9   r   r   r   r   r     s   

zMotif.format)r	   NNN)F)r   N)rP   rQ   rR   rS   r=   _Motif__get_mask_Motif__set_maskpropertyr^   _Motif__get_pseudocounts_Motif__set_pseudocountsr\   _Motif__get_background_Motif__set_backgroundr]   r   r   r   r   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   %  sN    
;

	
'










pr   c                 C   sp   |  }|dv rddlm} || |S |dkr#ddlm} || S |dkr2ddlm} || S td| )	aJ  Return a string representation of motifs in the given format.

    Currently supported formats (case is ignored):
     - clusterbuster: Cluster Buster position frequency matrix format
     - pfm : JASPAR simple single Position Frequency Matrix
     - jaspar : JASPAR multiple PFM format
     - transfac : TRANSFAC like files

    r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r!   )r(   r#   r   r   r   r   r   r   r     s   


r   __main__)run_doctest)verbose)r	   )T)rS   r1   urllib.parser   urllib.requestr   r   numpyrY   ImportErrorBior   r   	Bio.Alignr   r0   r   r   r%   r    listr*   r   r   rP   
Bio._utilsr   r   r   r   r   <module>   s>   	


Y:g   P