o
    Rŀg1                     @   sX   d Z G dd dZG dd dZG dd dZG dd dZd	d
 Zdd Zdd ZdS )a  Parse Unigene flat file format files such as the Hs.data file.

Here is an overview of the flat file format that this parser deals with:

   Line types/qualifiers::

       ID           UniGene cluster ID
       TITLE        Title for the cluster
       GENE         Gene symbol
       CYTOBAND     Cytological band
       EXPRESS      Tissues of origin for ESTs in cluster
       RESTR_EXPR   Single tissue or development stage contributes
                    more than half the total EST frequency for this gene.
       GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
                    T if a non-templated polyA tail is found among
                    a cluster's sequences; else
                    I if templated As are found in genomic sequence or
                    S if a canonical polyA signal is found on
                      the genomic sequence
       GENE_ID      Entrez gene identifier associated with at least one
                    sequence in this cluster;
                    to be used instead of LocusLink.
       LOCUSLINK    LocusLink identifier associated with at least one
                    sequence in this cluster;
                    deprecated in favor of GENE_ID
       HOMOL        Homology;
       CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping
                    on the arabidopsis genome.
       STS          STS
            ACC=         GenBank/EMBL/DDBJ accession number of STS
                         [optional field]
            UNISTS=      identifier in NCBI's UNISTS database
       TXMAP        Transcript map interval
            MARKER=      Marker found on at least one sequence in this
                         cluster
            RHPANEL=     Radiation Hybrid panel used to place marker
       PROTSIM      Protein Similarity data for the sequence with
                    highest-scoring protein similarity in this cluster
            ORG=         Organism
            PROTGI=      Sequence GI of protein
            PROTID=      Sequence ID of protein
            PCT=         Percent alignment
            ALN=         length of aligned region (aa)
       SCOUNT       Number of sequences in the cluster
       SEQUENCE     Sequence
            ACC=         GenBank/EMBL/DDBJ accession number of sequence
            NID=         Unique nucleotide sequence identifier (gi)
            PID=         Unique protein sequence identifier (used for
                         non-ESTs)
            CLONE=       Clone identifier (used for ESTs only)
            END=         End (5'/3') of clone insert read (used for
                         ESTs only)
            LID=         Library ID; see Hs.lib.info for library name
                         and tissue
            MGC=         5' CDS-completeness indicator; if present, the
                         clone associated with this sequence is believed
                         CDS-complete. A value greater than 511 is the gi
                         of the CDS-complete mRNA matched by the EST,
                         otherwise the value is an indicator of the
                         reliability of the test indicating CDS
                         completeness; higher values indicate more
                         reliable CDS-completeness predictions.
           SEQTYPE=      Description of the nucleotide sequence.
                         Possible values are mRNA, EST and HTC.
           TRACE=        The Trace ID of the EST sequence, as provided by
                         NCBI Trace Archive

c                   @   *   e Zd ZdZd	ddZdd Zdd ZdS )
SequenceLinea  Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):
     - ACC=         GenBank/EMBL/DDBJ accession number of sequence
     - NID=         Unique nucleotide sequence identifier (gi)
     - PID=         Unique protein sequence identifier (used for non-ESTs)
     - CLONE=       Clone identifier (used for ESTs only)
     - END=         End (5'/3') of clone insert read (used for ESTs only)
     - LID=         Library ID; see Hs.lib.info for library name and tissue
     - MGC=         5' CDS-completeness indicator; if present,
       the clone associated with this sequence
       is believed CDS-complete. A value greater than 511
       is the gi of the CDS-complete mRNA matched by the EST,
       otherwise the value is an indicator of the reliability
       of the test indicating CDS completeness;
       higher values indicate more reliable CDS-completeness
       predictions.
     - SEQTYPE=     Description of the nucleotide sequence. Possible values
       are mRNA, EST and HTC.
     - TRACE=       The Trace ID of the EST sequence, as provided by NCBI
       Trace Archive

    Nc                 C   sb   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
|dur/|| _| | dS dS )Initialize the class. FN)accnidlidpidcloneimageis_imageendmgcseqtypetracetext_init_from_textselfr    r   H/var/www/html/myenv/lib/python3.10/site-packages/Bio/UniGene/__init__.py__init__i   s   zSequenceLine.__init__c                 C   sb   | d}|D ]'}| d\}}|dkr&|d d dkr&d| _|dd  | _t| | | qd S )N; =CLONE   IMAGET   )splitr   r
   setattrlowerr   r   partspartkeyvalr   r   r   r   z   s   
zSequenceLine._init_from_textc                 C      | j S )z/Return UniGene SequenceLine object as a string.r   r   r   r   r   __repr__      zSequenceLine.__repr__N__name__
__module____qualname____doc__r   r   r(   r   r   r   r   r   N   s
    

r   c                   @   r   )
ProtsimLinea|  Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)
    Nc                 C   s>   d| _ d| _d| _d| _d| _|dur|| _| | dS dS r   r   N)orgprotgiprotidpctalnr   r   r   r   r   r   r      s   zProtsimLine.__init__c                 C   6   | d}|D ]}| d\}}t| | | qd S )Nr   r   r   r   r   r    r   r   r   r      
   
zProtsimLine._init_from_textc                 C   r%   )z.Return UniGene ProtsimLine object as a string.r&   r'   r   r   r   r(      r)   zProtsimLine.__repr__r*   r+   r   r   r   r   r0      s
    
r0   c                   @   r   )
STSLinea;  Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database
    Nc                 C   s,   d| _ d| _|dur|| _| | dS dS r1   )r   unistsr   r   r   r   r   r   r      s   zSTSLine.__init__c                 C   r7   )N r   r8   r    r   r   r   r      r9   zSTSLine._init_from_textc                 C   r%   )z*Return UniGene STSLine object as a string.r&   r'   r   r   r   r(      r)   zSTSLine.__repr__r*   r+   r   r   r   r   r:      s
    

r:   c                   @   s    e Zd ZdZdd Zdd ZdS )Recordat  Store a Unigene record.

    Here is what is stored::

        self.ID           = ''  # ID line
        self.species      = ''  # Hs, Bt, etc.
        self.title        = ''  # TITLE line
        self.symbol       = ''  # GENE line
        self.cytoband     = ''  # CYTOBAND line
        self.express      = []  # EXPRESS line, parsed on ';'
                                # Will be an array of strings
        self.restr_expr   = ''  # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id      = ''  # GENE_ID line
        self.locuslink    = ''  # LOCUSLINK line
        self.homol        = ''  # HOMOL line
        self.chromosome   = ''  # CHROMOSOME line
        self.protsim      = []  # PROTSIM entries, array of Protsims
                                # Type ProtsimLine
        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
                                # Type SequenceLine
        self.sts          = []  # STS entries, array of STS entries
                                # Type STSLine
        self.txmap        = []  # TXMAP entries, array of TXMap entries

    c                 C   sd   d| _ d| _d| _d| _d| _g | _d| _d| _d| _d| _	d| _
d| _g | _g | _g | _g | _dS r1   )IDspeciestitlesymbolcytobandexpress
restr_exprgnm_terminusgene_id	locuslinkhomol
chromosomeprotsimsequenceststxmapr'   r   r   r   r      s    
zRecord.__init__c                 C   s&   d| j j d| j d| j d| j S )z>Represent the UniGene Record object as a string for debugging.<z> r<   )	__class__r,   r>   rA   r@   r'   r   r   r   r(      s   &zRecord.__repr__N)r,   r-   r.   r/   r   r(   r   r   r   r   r=      s    r=   c                 c   s    	 t | }|s
dS |V  q)zGRead and load a UniGene records, for files containing multiple records.TN)_read)handlerecordr   r   r   parse   s   rS   c                 C   s,   t | }|s
td|  }|rtd|S )z4Read and load a UniGene record, one record per file.zNo SwissProt record foundz$More than one SwissProt record found)rP   
ValueErrorread)rQ   rR   	remainderr   r   r   rU   	  s   rU   c           
      C   s  d}d }| D ]}|d |   ||d    }}|  }|dkr1t }||_|jdd |_q|dkr9||_q|dkrA||_q|dkrI||_q|dkrQ||_q|d	krl|d
kr]d|_	q|dkred|_	qt
d| |dkr|dd |dD |_q|dkrdd |dD |_q|dkr||_q|dkr||_q|dkrt|}|j| q|dkrt|}q|dkrt|}|j| q|dkrt|}	|j|	 q|dkrt|j|krt
d|t|jf |  S t
d| |rt
dd S )N   r>   .    TITLEGENEGENE_ID	LOCUSLINKHOMOLYESTNOzCannot parse HOMOL line EXPRESSc                 S      g | ]}|  qS r   strip.0wordr   r   r   
<listcomp>2      z_read.<locals>.<listcomp>|
RESTR_EXPRc                 S   rb   r   rc   re   r   r   r   rh   4  ri   
CHROMOSOMECYTOBANDPROTSIMSCOUNTSEQUENCESTSz//zkThe number of sequences specified in the record (%d) does not agree with the number of sequences found (%d)zUnknown tag zUnexpected end of stream.)rstripr=   r>   r   r?   r@   rA   rF   rG   rH   rT   rC   rD   rI   rB   r0   rJ   appendintr   rK   r:   rL   len)
rQ   	UG_INDENTrR   linetagvaluerJ   scountrK   rL   r   r   r   rP     sl   "
rP   N)r/   r   r0   r:   r=   rS   rU   rP   r   r   r   r   <module>   s   F;$4	