o
    RŀgLm                     @   sv   d Z ddlZddlmZ ddlmZ zddlZW n ey#   edw ddlm	Z	 ddlm
Z
 dZG d	d
 d
ZdS )a  Provides read access to a JASPAR5 formatted database.

This modules requires MySQLdb to be installed.

Example, substitute the your database credentials as
appropriate::

        from Bio.motifs.jaspar.db import JASPAR5
        JASPAR_DB_HOST = "hostname.example.org"
        JASPAR_DB_NAME = "JASPAR2018"
        JASPAR_DB_USER = "guest"
        JASPAR_DB_PASS = "guest"

        jdb = JASPAR5(
            host=JASPAR_DB_HOST,
            name=JASPAR_DB_NAME,
            user=JASPAR_DB_USER,
            password=JASPAR_DB_PASS
        )
        ets1 = jdb.fetch_motif_by_id('MA0098')
        print(ets1)
    TF name ETS1
    Matrix ID   MA0098.3
    Collection  CORE
    TF class    ['Tryptophan cluster factors']
    TF family   ['Ets-related factors']
    Species 9606
    Taxonomic group vertebrates
    Accession   ['P14921']
    Data type used  HT-SELEX
    Medline 20517297
    PAZAR ID    TF0000070
    Comments    Data is from Taipale HTSELEX DBD (2013)
    Matrix:
            0      1      2      3      4      5      6      7      8      9
    A: 2683.00 180.00 425.00   0.00   0.00 2683.00 2683.00 1102.00  89.00 803.00
    C: 210.00 2683.00 2683.00  21.00   0.00   0.00   9.00  21.00 712.00 401.00
    G: 640.00 297.00   7.00 2683.00 2683.00   0.00  31.00 1580.00 124.00 1083.00
    T: 241.00  22.00   0.00   0.00  12.00   0.00 909.00  12.00 1970.00 396.00

        motifs = jdb.fetch_motifs(
            collection = 'CORE',
            tax_group = ['vertebrates', 'insects'],
            tf_class = 'Homeo domain factors',
            tf_family = ['TALE-type homeo domain factors', 'POU domain factors'],
            min_ic = 12
        )
        for motif in motifs:
            pass # do something with the motif
    N)BiopythonWarning)MissingPythonDependencyErrorz7Install MySQLdb if you want to use Bio.motifs.jaspar.db)jaspar)matrixCOREc                   @   s   e Zd ZdZdddZdd Zdd Zd	d
 ZeddddddddddddddfddZ	dd Z
dd Zdd Zdd ZedddddddddddfddZdd ZdS )JASPAR5am  Class representing a JASPAR5 database.

    Class representing a JASPAR5 DB. The methods within are loosely based
    on the perl TFBS::DB::JASPAR5 module.

    Note: We will only implement reading of JASPAR motifs from the DB.
    Unlike the perl module, we will not attempt to implement any methods to
    store JASPAR motifs or create a new DB at this time.
    Nc                 C   s.   || _ || _|| _|| _t||||| _dS )a  Construct a JASPAR5 instance and connect to specified DB.

        Arguments:
         - host - host name of the the JASPAR DB server
         - name - name of the JASPAR database
         - user - user name to connect to the JASPAR DB
         - password - JASPAR DB password

        N)namehostuserpasswordmdbconnectdbh)selfr	   r   r
   r    r   H/var/www/html/myenv/lib/python3.10/site-packages/Bio/motifs/jaspar/db.py__init__W   s
   
zJASPAR5.__init__c                 C   s   | j  d| j d| j S )z<Return a string representation of the JASPAR5 DB connection.z\@:)r
   r	   r   )r   r   r   r   __str__h   s   zJASPAR5.__str__c                 C   sF   t |\}}|s| |}d}|r| ||}d}|r!| |}|S )a  Fetch a single JASPAR motif from the DB by its JASPAR matrix ID.

        Example id 'MA0001.1'.

        Arguments:
         - id - JASPAR matrix ID. This may be a fully specified ID including
                the version number (e.g. MA0049.2) or just the base ID (e.g.
                MA0049). If only a base ID is provided, the latest version is
                returned.

        Returns:
         - A Bio.motifs.jaspar.Motif object

        **NOTE:** The perl TFBS module allows you to specify the type of matrix
        to return (PFM, PWM, ICM) but matrices are always stored in JASPAR as
        PFMs so this does not really belong here. Once a PFM is fetched the
        pwm() and pssm() methods can be called to return the normalized and
        log-odds matrices.

        N)r   split_jaspar_id_fetch_latest_version_fetch_internal_id_fetch_motif_by_internal_id)r   idbase_idversionint_idmotifr   r   r   fetch_motif_by_idl   s   

zJASPAR5.fetch_motif_by_idc                 C   s   | j d|dS )a  Fetch a list of JASPAR motifs from a JASPAR DB by the given TF name(s).

        Arguments:
        name - a single name or list of names
        Returns:
        A list of Bio.motifs.jaspar.Motif objects

        Notes:
        Names are not guaranteed to be unique. There may be more than one
        motif with the same name. Therefore even if name specifies a single
        name, a list of motifs is returned. This just calls
        self.fetch_motifs(collection = None, tf_name = name).

        This behaviour is different from the TFBS perl module's
        get_Matrix_by_name() method which always returns a single matrix,
        issuing a warning message and returning the first matrix retrieved
        in the case where multiple matrices have the same name.

        N)
collectiontf_name)fetch_motifs)r   r   r   r   r   fetch_motifs_by_name   s   zJASPAR5.fetch_motifs_by_namer   Fc                    s   | j |||||||||	|
||d}t }	 |D ]2}| | |r) j |k r)q|r1 j|k r1q	 |rEt fdd jD }||k rEq|	  q|S )a  Fetch jaspar.Record (list) of motifs using selection criteria.

        Arguments::

            Except where obvious, all selection criteria arguments may be
            specified as a single value or a list of values. Motifs must
            meet ALL the specified selection criteria to be returned with
            the precedent exceptions noted below.

            all         - Takes precedent of all other selection criteria.
                          Every motif is returned. If 'all_versions' is also
                          specified, all versions of every motif are returned,
                          otherwise just the latest version of every motif is
                          returned.
            matrix_id   - Takes precedence over all other selection criteria
                          except 'all'.  Only motifs with the given JASPAR
                          matrix ID(s) are returned. A matrix ID may be
                          specified as just a base ID or full JASPAR IDs
                          including version number. If only a base ID is
                          provided for specific motif(s), then just the latest
                          version of those motif(s) are returned unless
                          'all_versions' is also specified.
            collection  - Only motifs from the specified JASPAR collection(s)
                          are returned. NOTE - if not specified, the collection
                          defaults to CORE for all other selection criteria
                          except 'all' and 'matrix_id'. To apply the other
                          selection criteria across all JASPAR collections,
                          explicitly set collection=None.
            tf_name     - Only motifs with the given name(s) are returned.
            tf_class    - Only motifs of the given TF class(es) are returned.
            tf_family   - Only motifs from the given TF families are returned.
            tax_group   - Only motifs belonging to the given taxonomic
                          supergroups are returned (e.g. 'vertebrates',
                          'insects', 'nematodes' etc.)
            species     - Only motifs derived from the given species are
                          returned.  Species are specified as taxonomy IDs.
            data_type   - Only motifs generated with the given data type (e.g.
                          ('ChIP-seq', 'PBM', 'SELEX' etc.) are returned.
                          NOTE - must match exactly as stored in the database.
            pazar_id    - Only motifs with the given PAZAR TF ID are returned.
            medline     - Only motifs with the given medline (PubmMed IDs) are
                          returned.
            min_ic      - Only motifs whose profile matrices have at least this
                          information content (specificty) are returned.
            min_length  - Only motifs whose profiles are of at least this
                          length are returned.
            min_sites   - Only motifs compiled from at least these many binding
                          sites are returned.
            all_versions- Unless specified, just the latest version of motifs
                          determined by the other selection criteria are
                          returned. Otherwise all versions of the selected
                          motifs are returned.

        Returns:
            - A Bio.motifs.jaspar.Record (list) of motifs.

        )r   r    tf_class	tf_family	matrix_id	tax_groupspeciespazar_id	data_typemedlineallall_versionsc                 3   s    | ]
} j | d  V  qdS )r   N)counts).0ntr   r   r   	<genexpr>   s    z'JASPAR5.fetch_motifs.<locals>.<genexpr>)
_fetch_internal_id_listr   Recordr   pssmmeanlengthsumalphabetappend)r   r   r    r#   r$   r%   r&   r'   r(   r)   r*   min_ic
min_length	min_sitesr+   r,   int_idsrecordr   	num_sitesr   r0   r   r!      s>   L

zJASPAR5.fetch_motifsc                 C   sL   | j  }|d|f | }d}|r|d }|S td| dt |S )z>Get the latest version number for the given base_id (PRIVATE).zKselect VERSION from MATRIX where BASE_id = %s order by VERSION desc limit 1Nr   zEFailed to fetch latest version number for JASPAR motif with base ID 'zF'. No JASPAR motif with this base ID appears to exist in the database.r   cursorexecutefetchonewarningswarnr   )r   r   currowlatestr   r   r   r   (  s"   
	zJASPAR5._fetch_latest_versionc                 C   sT   | j  }|d||f | }d}|r|d }|S td| d| dt |S )zsFetch the internal id for a base id + version (PRIVATE).

        Also checks if this combo exists or not.
        z9select id from MATRIX where BASE_id = %s and VERSION = %sNr   zFFailed to fetch internal database ID for JASPAR motif with matrix ID '.z8'. No JASPAR motif with this matrix ID appears to exist.r@   )r   r   r   rF   rG   r   r   r   r   r   @  s&   
	zJASPAR5._fetch_internal_idc                 C   s  | j  }|d|f | }|std| t dS |d }|d }|d }|d }d|d	t|g}| 	|}	t
j||||	d
}
|d|f g }| }|D ]	}||d  qT||
_|d|f g }| }|D ]	}||d  qp||
_|d|f | }g }g }|D ]G}|d }|d }|dkr|| q|dkr|| q|dkr||
_q|dkr||
_q|dkr||
_q|dkr||
_q|dkr||
_q	 q||
_||
_|
S )z(Fetch basic motif information (PRIVATE).zCselect BASE_ID, VERSION, COLLECTION, NAME from MATRIX where id = %sz0Could not fetch JASPAR motif with internal ID = Nr             rI   )r   r-   z/select TAX_ID from MATRIX_SPECIES where id = %sz,select ACC FROM MATRIX_PROTEIN where id = %sz4select TAG, VAL from MATRIX_ANNOTATION where id = %sclassfamilyr&   typepazar_tf_idr*   comment)r   rA   rB   rC   rD   rE   r   joinstr_fetch_counts_matrixr   Motiffetchallr9   r'   accr&   r)   r(   r*   rR   r$   r#   )r   r   rF   rG   r   r   r   r   r%   r-   r   tax_idsrowsaccsr$   r#   attrvalr   r   r   r   Z  sp   

z#JASPAR5._fetch_motif_by_internal_idc                 C   sj   i }| j  }dD ]%}g }|d||f | }|D ]	}||d  qdd |D ||< q	td|S )zFetch the counts matrix from the JASPAR DB by the internal ID (PRIVATE).

        Returns a Bio.motifs.matrix.GenericPositionMatrix
        ACGTzCselect val from MATRIX_DATA where ID = %s and row = %s order by colr   c                 S   s   g | ]}t |qS r   )float)r.   xr   r   r   
<listcomp>  s    z0JASPAR5._fetch_counts_matrix.<locals>.<listcomp>)r   rA   rB   rW   r9   r   GenericPositionMatrix)r   r   r-   rF   basebase_countsrZ   rG   r   r   r   rU     s   
zJASPAR5._fetch_counts_matrixc                 C   sP  g }| j  }	 |r!|d | }|D ]	}||d  q|S 	 |rr	 |rL|D ] }t|\}}|d|f | }|D ]	}||d  q?q)|S |D ]!}t|\}}|s^| |}d}|rh| ||}|ro|| qN|S dg}g }|rt	|t
rd}d|d|g}d|d	g}nd
| }|| |rt	|t
rd}d|d|g}d|d	g}nd| }|| |r|d |d 	 t	|t
rd}d|ddd |D g}d|d	g}nd| }|| 	 |r0|d |d d}t	|t
r"d|dg}d|d|g}d|d	g}n	d|d| g}|| |rl|d |d d}t	|t
r^d|dg}d|d|g}d|d	g}n	d|d| g}|| |r|d |d d}t	|t
rd|d g}d|d|g}d|d	g}ndd!| g}|| |
r|d" |d# d$}t	|
t
rd|d%g}d|d|
g}d|d	g}ndd&|
 g}|| |	r|d' |d( d)}t	|	t
rd|d*g}d|d|	g}d|d	g}ndd+|	 g}|| |rY|d, |d- d.}t	|t
rKd|d/g}d|d|g}d|d	g}n	d|d0| g}|| dd1d2|g}|rqd|d3d4|g}|| | }|D ]}|d }|r|| q|| |r|| q|t|d5k rtd6t |S )7a   Fetch list of internal JASPAR motif IDs.

        Fetch a list of internal JASPAR motif IDs based on various passed
        parameters which may then be used to fetch the rest of the motif data.

        Caller:
            fetch_motifs()

        Arguments:
            See arguments sections of fetch_motifs()

        Returns:
            A list of internal JASPAR motif IDs which match the given
            selection criteria arguments.


        Build an SQL query based on the selection arguments provided.

        1: First add table joins and sub-clauses for criteria corresponding to
           named fields from the MATRIX and MATRIX_SPECIES tables such as
           collection, matrix ID, name, species etc.

        2: Then add joins/sub-clauses for tag/value parameters from the
           MATRIX_ANNOTATION table.

        For the surviving matrices, the responsibility to do matrix-based
        feature filtering such as ic, number of sites etc, fall on the
        calling fetch_motifs() method.

        zselect ID from MATRIXr   z(select ID from MATRIX where BASE_ID = %sNzMATRIX mzm.COLLECTION in ('rM   z','z')zm.COLLECTION = '%s'zm.NAME in ('zm.NAME = '%s'zMATRIX_SPECIES mszm.ID = ms.IDzms.TAX_ID in ('c                 s   s    | ]}t |V  qd S )N)rT   )r.   sr   r   r   r1   \  s    z2JASPAR5._fetch_internal_id_list.<locals>.<genexpr>zms.TAX_ID = '%s'zMATRIX_ANNOTATION ma1zm.ID = ma1.IDzma1.TAG = 'class'z and ma1.VAL in ('z and ma1.VAL = '%s' zMATRIX_ANNOTATION ma2zm.ID = ma2.IDzma2.TAG = 'family'z and ma2.VAL in ('z and ma2.VAL = '%s' zMATRIX_ANNOTATION ma3zm.ID = ma3.IDzma3.TAG = 'pazar_tf_id'z and ma3.VAL in ('z and ma3.VAL = '%s' zMATRIX_ANNOTATION ma4zm.ID = ma4.IDzma4.TAG = 'medline'z and ma4.VAL in ('z and ma4.VAL = '%s' zMATRIX_ANNOTATION ma5zm.ID = ma5.IDzma5.TAG = 'type'z and ma5.VAL in ('z and ma5.VAL = '%s' zMATRIX_ANNOTATION ma6zm.ID = ma6.IDzma6.TAG = 'tax_group'z and ma6.VAL in ('z and ma6.VAL = '%s' zselect distinct(m.ID) from z, z where z and rJ   z1Zero motifs returned with current select criteria)r   rA   rB   rW   r9   r   r   r   r   
isinstancelistrS   _is_latest_versionlenrD   rE   r   )r   r   r    r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r=   rF   rZ   rG   r   r   r   r   tableswhere_clausesclausesqlr   r   r   r2     s  -































zJASPAR5._fetch_internal_id_listc                 C   s:   | j  }|d||f | }|d }|dkrdS dS )zCheck if the internal ID represents the latest JASPAR matrix (PRIVATE).

        Does this internal ID represent the latest version of the JASPAR
        matrix (collapse on base ids)
        zselect count(*) from MATRIX where BASE_ID = (select BASE_ID from MATRIX where ID = %s) and VERSION > (select VERSION from MATRIX where ID = %s)r   TF)r   rA   rB   rC   )r   r   rF   rG   countr   r   r   rh     s   
zJASPAR5._is_latest_version)NNNN)__name__
__module____qualname____doc__r   r   r   r"   JASPAR_DFLT_COLLECTIONr!   r   r   r   rU   r2   rh   r   r   r   r   r   L   sT    

'
^
  +r   )rr   rD   Bior   r   MySQLdbr   ImportError
Bio.motifsr   r   rs   r   r   r   r   r   <module>   s   3