package org.snpeff.nextProt;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.forester.io.parsers.phyloxml.PhyloXmlMapping;
import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
import org.snpeff.codons.CodonTables;
import org.snpeff.collections.AutoHashMap;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Markers;
import org.snpeff.interval.NextProt;
import org.snpeff.interval.Transcript;
import org.snpeff.snpEffect.Config;
import org.snpeff.stats.CountByType;
import org.snpeff.util.Gpr;
import org.snpeff.util.GprSeq;
import org.snpeff.util.Timer;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/* loaded from: input_file:org/snpeff/nextProt/NextProtParser.class */
public class NextProtParser {
    // Conservation thresholds. NOTE(review): analyzeSequenceConservation() compares against the
    // literals 0.99 and 30; presumably those are these constants inlined by the compiler.
    public static final double HIGHLY_CONSERVED_AA_PERCENT = 0.99d;
    public static final int HIGHLY_CONSERVED_AA_COUNT = 30;
    // Annotation categories that are skipped; copied into categoryBlackList by the constructor.
    public static final String[] CATAGORY_BLACK_LIST_STR = {"", "expression-info", "mature-protein", "mature protein", "mutagenesis site", "mutagenesis-site", "mutagenesis", "pdb-mapping", "peptide-mapping", "retained intron", "sequence conflict", "sequence-conflict", "sequence variant", "sequence-variant", "srm-peptide-mapping", "variant"};
    // XML element names; all assigned in defineNextProtXmlTerms().
    protected String NODE_NAME_PROTEIN;
    protected String NODE_NAME_GENE;
    protected String NODE_NAME_TRANSCRIPT;
    protected String NODE_NAME_ANNOTATION;
    protected String NODE_NAME_ANNOTATION_LIST;
    protected String NODE_NAME_POSITION;
    protected String NODE_NAME_PROPERTY;
    protected String NODE_NAME_DESCRIPTION;
    protected String NODE_NAME_CVNAME;
    protected String NODE_NAME_SEQUENCE;
    protected String NODE_NAME_XREF;
    // XML attribute names; all assigned in defineNextProtXmlTerms().
    protected String ATTR_NAME_UNIQUE_NAME;
    protected String ATTR_NAME_DATABASE;
    protected String ATTR_NAME_ACCESSION;
    protected String ATTR_NAME_ANNOTATION_LIST;
    protected String ATTR_NAME_CATAGORY;
    protected String ATTR_NAME_FIRST;
    protected String ATTR_NAME_LAST;
    protected String ATTR_NAME_ISOFORM_REF;
    protected String ATTR_NAME_PROPERTY_NAME;
    protected String ATTR_NAME_VALUE;
    // XML attribute values; all assigned in defineNextProtXmlTerms().
    protected String ATTR_VALUE_ENSEMBL;
    protected String ATTR_VALUE_REFSEQ;
    protected String ATTR_VALUE_NUCLEOTIDE_SEQUENCE_ID;
    // Output switches, set through setDebug() / setVerbose().
    boolean debug;
    boolean verbose;
    // Optional path of a tab-separated transcript ID mapping file (see readTrIdMap()).
    String trIdFile;
    // Set form of CATAGORY_BLACK_LIST_STR, consulted by parseAnnotations().
    HashSet<String> categoryBlackList;
    // Filled by readTrIdMap(): second column of trIdFile -> first column.
    HashMap<String, String> trIdMap;
    Config config;
    // Taken from config.getGenome() in the constructor.
    Genome genome;
    // Not referenced by any code visible in this file.
    int aaErrors;
    // Transcript IDs whose protein sequence differed from / matched the NextProt sequence (see transcriptData()).
    HashSet<String> proteinDifferences = new HashSet<>();
    HashSet<String> proteinOk = new HashSet<>();
    // Filled by findTrIds(): uniqueName attribute -> accession attribute of the transcript node.
    HashMap<String, String> trIdByUniqueName = new HashMap<>();
    // Filled by findSequences(): uniqueName attribute -> text of the sequence node.
    HashMap<String, String> sequenceByUniqueName = new HashMap<>();
    // Annotation key (see key()) -> how often each amino acid sub-sequence was seen (see countAaSequence()).
    AutoHashMap<String, CountByType> countAaSequenceByType = new AutoHashMap<>(new CountByType());
    // Filled by addTr(): transcript ID (and mapped aliases) -> transcript.
    HashMap<String, Transcript> trById = new HashMap<>();
    // NextProt markers created by parseAnnotation(); returned by getMarkers().
    Markers markers = new Markers();

    public NextProtParser(Config config) {
        this.config = config;
        this.genome = config.getGenome();
        defineNextProtXmlTerms();
        this.categoryBlackList = new HashSet<>();
        for (String str : CATAGORY_BLACK_LIST_STR) {
            this.categoryBlackList.add(str);
        }
    }

    /**
     * Registers a transcript in {@code trById} under its own ID and under any alias that
     * {@code trIdMap} provides for the full ID or for the ID without its version suffix
     * (the part before the first '.').
     *
     * @param transcript transcript to register
     */
    void addTr(Transcript transcript) {
        String trId = transcript.getId();
        this.trById.put(trId, transcript);

        // Alias for the full ID
        String alias = this.trIdMap.get(trId);
        if (alias != null) {
            this.trById.put(alias, transcript);
        }

        // Alias for the un-versioned ID, only when there is something before the dot
        int dot = trId.indexOf('.');
        if (dot > 0) {
            String unversionedAlias = this.trIdMap.get(trId.substring(0, dot));
            if (unversionedAlias != null) {
                this.trById.put(unversionedAlias, transcript);
            }
        }
    }

    /**
     * Loads the transcript ID map and then registers every transcript of every gene of the
     * predictor's genome through {@link #addTr(Transcript)}.
     */
    void addTranscripts() {
        readTrIdMap();
        for (Gene gene : this.config.getSnpEffectPredictor().getGenome().getGenes()) {
            for (Transcript transcript : gene) {
                addTr(transcript);
            }
        }
    }

    /**
     * Decides, for every annotation key collected in {@code countAaSequenceByType}, whether the
     * amino acid (sub)sequences seen for that key are highly conserved, and flags all markers
     * carrying that key accordingly.
     *
     * <p>A key is highly conserved when one sequence accounts for more than
     * {@link #HIGHLY_CONSERVED_AA_PERCENT} of all observations and there are at least
     * {@link #HIGHLY_CONSERVED_AA_COUNT} observations. In verbose mode a table with the
     * per-key counts is printed to standard output.
     */
    void analyzeSequenceConservation() {
        if (this.verbose) {
            Timer.showStdErr("Sequence conservation analysis."
                    + "\n\tAA sequence length  : " + 1
                    + "\n\tMin AA count        : " + HIGHLY_CONSERVED_AA_COUNT
                    + "\n\tMin AA conservation : " + HIGHLY_CONSERVED_AA_PERCENT);
        }

        List<String> keys = new ArrayList<>(this.countAaSequenceByType.keySet());
        Collections.sort(keys);

        // Table header: one column per amino acid (the column block is emitted twice, as before)
        StringBuilder title = new StringBuilder();
        for (char aa : GprSeq.AMINO_ACIDS) {
            title.append(aa).append('\t');
        }
        title.append("\t" + title);
        if (this.verbose) {
            System.out.println("Amino acid regions:\n\tTotal\tMax count\tAvg len\tConservation\tCatergory\tControlled Vocabulary\t" + title + "\tOther AA sequences:");
        }

        for (String key : keys) {
            CountByType counts = this.countAaSequenceByType.get(key);
            long total = counts.sum();
            long seqLen = 0;
            long totalSeqs = 0;
            long maxCount = 0;
            boolean highlyConserved = false;
            StringBuilder row = new StringBuilder();

            // Single amino acid "sequences"
            for (char aa : GprSeq.AMINO_ACIDS) {
                long count = counts.get("" + aa);
                if (count > 0) {
                    seqLen += count;
                    totalSeqs += count;
                    maxCount = Math.max(maxCount, count);
                    row.append(count);
                    if (isHighlyConserved(count, total)) {
                        highlyConserved = true;
                    }
                }
                row.append("\t");
            }

            // Sequences of more than one amino acid
            for (String aaSeq : counts.keySet()) {
                if (aaSeq.length() <= 1) {
                    continue;
                }
                long count = counts.get(aaSeq);
                seqLen += aaSeq.length() * count;
                totalSeqs += count;
                maxCount = Math.max(maxCount, count);
                // Plain append: the sequence is data, not a format string
                row.append("\t").append(aaSeq).append(":").append(count);
                if (isHighlyConserved(count, total)) {
                    highlyConserved = true;
                }
            }

            // Guard against keys for which nothing was counted
            long avgLen = totalSeqs > 0 ? seqLen / totalSeqs : 0;
            if (this.verbose) {
                System.out.println("\t" + total + "\t" + maxCount + "\t" + avgLen + "\t" + (highlyConserved ? "High" : "") + "\t" + key + "\t" + row);
            }

            if (highlyConserved) {
                int marked = 0;
                for (Marker marker : this.markers) {
                    if (marker.getId().equals(key)) {
                        ((NextProt) marker).setHighlyConservedAaSequence(true);
                        marked++;
                    }
                }
                if (this.verbose) {
                    Timer.showStdErr("NextProt " + marked + " markers type '" + key + "' marked as highly conserved AA sequence");
                }
            }
        }
    }

    /**
     * Tells whether {@code count} out of {@code total} observations passes both conservation
     * thresholds. The ratio is computed in floating point; an integer division would only ever
     * yield 0 or 1.
     */
    private static boolean isHighlyConserved(long count, long total) {
        return total >= HIGHLY_CONSERVED_AA_COUNT && ((double) count) / total > HIGHLY_CONSERVED_AA_PERCENT;
    }

    /**
     * Increments the counter of sub-sequence {@code str4} for the annotation key built from
     * {@code str}, {@code str2} and {@code str3} (see {@link #key(String, String, String)}).
     */
    void countAaSequence(String str, String str2, String str3, String str4) {
        String annotationKey = key(str, str2, str3);
        CountByType counts = this.countAaSequenceByType.getOrCreate(annotationKey);
        counts.inc(str4);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Assigns the XML element names, attribute names and attribute values used when walking
     * the NextProt document. Called from the constructor.
     */
    public void defineNextProtXmlTerms() {
        // Element names
        NODE_NAME_PROTEIN = PhyloXmlUtil.SEQ_TYPE_PROTEIN;
        NODE_NAME_GENE = "gene";
        NODE_NAME_TRANSCRIPT = "transcript";
        NODE_NAME_ANNOTATION = PhyloXmlMapping.ANNOTATION;
        NODE_NAME_ANNOTATION_LIST = "annotationList";
        NODE_NAME_POSITION = "position";
        NODE_NAME_PROPERTY = PhyloXmlMapping.PROPERTY;
        NODE_NAME_DESCRIPTION = PhyloXmlMapping.PHYLOGENY_DESCRIPTION;
        NODE_NAME_CVNAME = "cvName";
        NODE_NAME_SEQUENCE = PhyloXmlMapping.SEQUENCE;
        NODE_NAME_XREF = "xref";

        // Attribute names
        ATTR_NAME_UNIQUE_NAME = "uniqueName";
        ATTR_NAME_DATABASE = Config.KEY_DATABASE_LOCAL;
        ATTR_NAME_ACCESSION = PhyloXmlMapping.ACCESSION;
        ATTR_NAME_ANNOTATION_LIST = "annotationList";
        ATTR_NAME_CATAGORY = "category";
        ATTR_NAME_FIRST = "first";
        ATTR_NAME_LAST = "last";
        ATTR_NAME_ISOFORM_REF = "isoformRef";
        ATTR_NAME_PROPERTY_NAME = "propertyName";
        ATTR_NAME_VALUE = PhyloXmlMapping.CLADE_DATE_VALUE;

        // Attribute values
        ATTR_VALUE_ENSEMBL = "Ensembl";
        ATTR_VALUE_REFSEQ = "RefSeq";
        // NOTE(review): the leading single quote looks accidental - confirm against the XML schema.
        ATTR_VALUE_NUCLEOTIDE_SEQUENCE_ID = "'nucleotide sequence ID";
    }

    /**
     * Prints {@code str} to standard error, prefixed with "Fatal error: ", and terminates the
     * JVM with exit status -1.
     *
     * @param str error description
     */
    protected void fatalError(String str) {
        final String message = "Fatal error: " + str;
        System.err.println(message);
        System.exit(-1);
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Collects nodes matching the given filters, starting at {@code node}.
     *
     * <p>A {@code null} filter matches anything. When an attribute filter is given, a node
     * matches only if it has at least one attribute satisfying both attribute filters.
     * Node values are compared after replacing newlines by spaces and trimming.
     *
     * <p>For an element node the search descends into its children and then continues with
     * the element's next sibling; for any other node type the walk stops after that node.
     *
     * @param node first node to examine (may be {@code null})
     * @param str  required node name, or {@code null}
     * @param str2 required node value, or {@code null}
     * @param str3 required attribute name, or {@code null}
     * @param str4 required attribute value, or {@code null}
     * @return matching nodes in visiting order (never {@code null})
     */
    public ArrayList<Node> findNodes(Node node, String str, String str2, String str3, String str4) {
        ArrayList<Node> matches = new ArrayList<>();
        final boolean attributeFilter = str3 != null || str4 != null;

        Node current = node;
        while (current != null) {
            String name = current.getNodeName();
            String value = current.getNodeValue();
            if (value != null) {
                value = value.replace('\n', ' ').trim();
            }

            boolean accepted = (str == null || str.equals(name)) && (str2 == null || str2.equals(value));

            if (accepted && attributeFilter) {
                // Name/value are fine; now some attribute has to satisfy the attribute filters
                accepted = false;
                NamedNodeMap attributes = current.getAttributes();
                if (attributes != null) {
                    for (int idx = 0; idx < attributes.getLength() && !accepted; idx++) {
                        Node attribute = attributes.item(idx);
                        String attrName = attribute.getNodeName();
                        String attrValue = attribute.getNodeValue();
                        accepted = (str3 == null || str3.equals(attrName)) && (str4 == null || str4.equals(attrValue));
                    }
                }
            }

            if (accepted) {
                matches.add(current);
            }

            if (current.getNodeType() == Node.ELEMENT_NODE) {
                matches.addAll(findNodes(current.getChildNodes(), str, str2, str3, str4));
                current = current.getNextSibling();
            } else {
                current = null;
            }
        }
        return matches;
    }

    /**
     * Runs {@link #findNodes(Node, String, String, String, String)} on every item of
     * {@code nodeList} and concatenates the results.
     *
     * @return matching nodes (never {@code null})
     */
    List<Node> findNodes(NodeList nodeList, String str, String str2, String str3, String str4) {
        List<Node> matches = new ArrayList<>();
        int idx = 0;
        while (idx < nodeList.getLength()) {
            Node item = nodeList.item(idx);
            matches.addAll(findNodes(item, str, str2, str3, str4));
            idx++;
        }
        return matches;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Returns the first node found by {@link #findNodes(Node, String, String, String, String)},
     * or {@code null} when nothing matches.
     */
    public Node findOneNode(Node node, String str, String str2, String str3, String str4) {
        ArrayList<Node> matches = findNodes(node, str, str2, str3, str4);
        return matches.isEmpty() ? null : matches.get(0);
    }

    /**
     * Stores the text of every sequence node found from {@code node} in
     * {@code sequenceByUniqueName}, keyed by the unique name of the sequence's parent node.
     */
    void findSequences(Node node) {
        List<Node> sequenceNodes = findNodes(node, this.NODE_NAME_SEQUENCE, null, null, null);
        for (Node sequenceNode : sequenceNodes) {
            String sequence = getText(sequenceNode);
            String uniqueName = getUniqueNameSequence(sequenceNode);
            this.sequenceByUniqueName.put(uniqueName, sequence);
        }
    }

    /**
     * Records, for every transcript node whose database attribute is the Ensembl value, the
     * mapping from the parent's unique name to the node's accession in {@code trIdByUniqueName}.
     *
     * @return {@code true} if at least one such transcript node was found
     */
    boolean findTrIds(Node node) {
        List<Node> transcriptNodes = findNodes(node, this.NODE_NAME_TRANSCRIPT, null, this.ATTR_NAME_DATABASE, this.ATTR_VALUE_ENSEMBL);
        for (Node transcriptNode : transcriptNodes) {
            String uniqueName = getUniqueNameTranscript(transcriptNode);
            String accession = getAttribute(transcriptNode, this.ATTR_NAME_ACCESSION);
            this.trIdByUniqueName.put(uniqueName, accession);
        }
        return !transcriptNodes.isEmpty();
    }

    /**
     * Returns the value of the node's "last" position attribute, parsed with
     * {@code Gpr.parseIntSafe} and converted from one-based to zero-based.
     */
    protected int getAaEnd(Node node) {
        String last = getAttribute(node, this.ATTR_NAME_LAST);
        int oneBased = Gpr.parseIntSafe(last);
        return oneBased - 1;
    }

    /**
     * Returns the value of the node's "first" position attribute, parsed with
     * {@code Gpr.parseIntSafe} and converted from one-based to zero-based.
     */
    protected int getAaStart(Node node) {
        String first = getAttribute(node, this.ATTR_NAME_FIRST);
        int oneBased = Gpr.parseIntSafe(first);
        return oneBased - 1;
    }

    /**
     * Returns the text of the first description node found from {@code node}, cut at the
     * first ';' when that ';' is not the very first character; returns "" when there is no
     * description node.
     */
    String getAnnDescription(Node node) {
        Node descriptionNode = findOneNode(node, this.NODE_NAME_DESCRIPTION, null, null, null);
        String description = getText(descriptionNode);
        if (description == null) {
            return "";
        }
        int semicolon = description.indexOf(';');
        return semicolon > 0 ? description.substring(0, semicolon) : description;
    }

    /**
     * Returns all annotation-list nodes found from {@code node}.
     */
    List<Node> getAnnotationCategories(Node node) {
        List<Node> annotationLists = findNodes(node, this.NODE_NAME_ANNOTATION_LIST, null, null, null);
        return annotationLists;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Returns the value of attribute {@code str} of {@code node}.
     *
     * @return the attribute value, or {@code null} when the node is {@code null}, has no
     *         attribute map, or has no attribute of that name
     */
    public String getAttribute(Node node, String str) {
        if (node == null) {
            return null;
        }
        NamedNodeMap attributes = node.getAttributes();
        if (attributes == null) {
            return null;
        }
        Node attribute = attributes.getNamedItem(str);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /**
     * Returns the text of the first cvName node found from {@code node}, cut at the first
     * ';' if there is one; returns "" when there is no such node.
     */
    String getControlledVocubulary(Node node) {
        Node cvNode = findOneNode(node, this.NODE_NAME_CVNAME, null, null, null);
        String cv = getText(cvNode);
        if (cv == null) {
            cv = "";
        }
        int semicolon = cv.indexOf(';');
        if (semicolon < 0) {
            return cv;
        }
        return cv.substring(0, semicolon);
    }

    /**
     * Returns the accession of the first gene node found from {@code node} whose database
     * attribute is the Ensembl value, or {@code null} when there is none.
     *
     * @param node node to search from
     * @param str  not used by this implementation
     */
    String getGeneId(Node node, String str) {
        Node geneNode = findOneNode(node, this.NODE_NAME_GENE, null, this.ATTR_NAME_DATABASE, this.ATTR_VALUE_ENSEMBL);
        return getAttribute(geneNode, this.ATTR_NAME_ACCESSION);
    }

    /**
     * Returns the isoform reference attribute of the grandparent of {@code node}.
     */
    protected String getIsoformRefFromPos(Node node) {
        Node parent = node.getParentNode();
        Node grandParent = parent.getParentNode();
        return getAttribute(grandParent, this.ATTR_NAME_ISOFORM_REF);
    }

    /**
     * Returns the markers collected so far (the parser's own collection, not a copy).
     */
    public Markers getMarkers() {
        return markers;
    }

    /**
     * Returns the text content of {@code node} with newlines replaced by spaces and
     * surrounding whitespace trimmed, or {@code null} when {@code node} is {@code null}.
     */
    String getText(Node node) {
        if (node != null) {
            String content = node.getTextContent();
            String singleLine = content.replace('\n', ' ');
            return singleLine.trim();
        }
        return null;
    }

    /**
     * Returns the unique-name attribute of the parent of a sequence node.
     */
    String getUniqueNameSequence(Node node) {
        Node owner = node.getParentNode();
        return getAttribute(owner, this.ATTR_NAME_UNIQUE_NAME);
    }

    /**
     * Returns the unique-name attribute of the parent of a transcript node.
     */
    String getUniqueNameTranscript(Node node) {
        Node owner = node.getParentNode();
        return getAttribute(owner, this.ATTR_NAME_UNIQUE_NAME);
    }

    /**
     * Builds the annotation key {@code "<str>:<detail>"} where the detail is {@code str3}, or
     * {@code str2} when {@code str3} is {@code null} or empty. Both parts are passed through
     * {@link #vcfSafe(String)}; when the detail ends up empty only the first part is returned.
     */
    String key(String str, String str2, String str3) {
        String head = vcfSafe(str);
        boolean hasThird = str3 != null && !str3.isEmpty();
        String detail = vcfSafe(hasThird ? str3 : str2);
        if (detail.isEmpty()) {
            return head;
        }
        return head + ":" + detail;
    }

    /**
     * Returns a symbolic name for a DOM node type code.
     *
     * @param s node type code
     * @return the name for codes 1 to 12, or "DOCUMENT_POSITION_CONTAINED_BY" for 16
     * @throws RuntimeException with message "Unknown" for any other code
     */
    String nodeType(short s) {
        // Index i holds the name for code i + 1
        final String[] names = {
            "ELEMENT_NODE",
            "ATTRIBUTE_NODE",
            "TEXT_NODE",
            "CDATA_SECTION_NODE",
            "ENTITY_REFERENCE_NODE",
            "ENTITY_NODE",
            "PROCESSING_INSTRUCTION_NODE",
            "COMMENT_NODE",
            "DOCUMENT_NODE",
            "DOCUMENT_TYPE_NODE",
            "DOCUMENT_FRAGMENT_NODE",
            "NOTATION_NODE"
        };
        if (s >= 1 && s <= names.length) {
            return names[s - 1];
        }
        if (s == 16) {
            return "DOCUMENT_POSITION_CONTAINED_BY";
        }
        throw new RuntimeException("Unknown");
    }

    /**
     * Parses a NextProt XML (sub)tree: registers the genome's transcripts, processes every
     * protein node found below {@code node} and finally runs the sequence conservation
     * analysis.
     *
     * @param node node whose children are searched for protein nodes
     */
    public void parse(Node node) {
        addTranscripts();

        if (this.verbose) {
            Timer.showStdErr("Parsing XML data.");
        }
        List<Node> proteinNodes = findNodes(node.getChildNodes(), this.NODE_NAME_PROTEIN, null, null, null);
        if (this.verbose) {
            Timer.showStdErr("Found " + proteinNodes.size() + " protein nodes");
        }

        for (Node proteinNode : proteinNodes) {
            if (this.debug) {
                Gpr.debug("Processing protein node: " + toString(proteinNode));
            }
            parseProteinNode(proteinNode);
        }

        analyzeSequenceConservation();
    }

    /**
     * Creates a {@link NextProt} marker for every position node of one annotation.
     *
     * <p>For each position the zero-based amino acid range [first, last] is read, the
     * corresponding sub-sequence of the isoform's NextProt sequence is extracted, and the range
     * is mapped to genomic coordinates through
     * {@link #transcriptData(String, int, int, String, String)}. Successfully mapped ranges
     * are added to {@code markers} and their sub-sequence is counted for the conservation
     * analysis.
     *
     * @param node annotation node
     * @param str  gene ID (used for debug output only)
     * @param str2 annotation category
     */
    void parseAnnotation(Node node, String str, String str2) {
        if (this.debug) {
            Gpr.debug("\t\tAnnotation: " + toString(node) + "\tCategory: " + str2);
        }
        String description = getAnnDescription(node);
        String controlledVocabulary = getControlledVocubulary(node);

        for (Node positionNode : findNodes(node, this.NODE_NAME_POSITION, null, null, null)) {
            if (this.debug) {
                Gpr.debug("\t\t\tPosition: " + toString(positionNode));
            }

            int aaStart = getAaStart(positionNode);
            // The end of the range comes from the "last" attribute, not from "first" again
            int aaEnd = getAaEnd(positionNode);
            if (aaStart < 0 || aaEnd < 0) {
                continue;
            }
            int aaLen = (aaEnd - aaStart) + 1;

            String isoformRef = getIsoformRefFromPos(positionNode);
            String sequence = this.sequenceByUniqueName.get(isoformRef);
            String subSeq = "";
            if (sequence != null && aaEnd >= aaStart && aaEnd < sequence.length()) {
                subSeq = sequence.substring(aaStart, aaEnd + 1);
            }

            TranscriptData trData = transcriptData(isoformRef, aaStart, aaEnd, sequence, subSeq);
            if (trData.ok && aaLen > 0) {
                NextProt nextProt = new NextProt(trData.tr, trData.chrPosStart, trData.chrPosEnd, key(str2, controlledVocabulary, description));
                this.markers.add((Marker) nextProt);
                if (this.debug) {
                    Gpr.debug("Added NextProt entry:" + nextProt + "\n\tgeneId:" + str + "\n\tisoformRef:" + isoformRef + "\n\ttrId:" + trData.tr.getId() + "\n\tcategory:'" + str2 + "'\n\tdescription:'" + description + "'\n\tcontrolled_vocabulary:" + controlledVocabulary + "\n\taaStart:" + aaStart + "\n\taaEnd:" + aaEnd + "\n\taaLen:" + aaLen + "\n\tchr:" + trData.chrName + "\n\tstart:" + trData.chrPosStart + "\n\tend:" + trData.chrPosEnd + "\n\tsubSeq:" + subSeq + "\n\tcodon:" + trData.codon + "\n\taa:" + trData.aa);
                }
                countAaSequence(str2, controlledVocabulary, description, subSeq);
            }
        }
    }

    /**
     * Parses every annotation of every annotation list found from {@code node}, skipping
     * lists whose category is black-listed.
     *
     * @param node protein node
     * @param str  gene ID passed on to {@link #parseAnnotation(Node, String, String)}
     */
    void parseAnnotations(Node node, String str) {
        for (Node annotationList : getAnnotationCategories(node)) {
            String category = getAttribute(annotationList, this.ATTR_NAME_CATAGORY);
            if (this.categoryBlackList.contains(category)) {
                continue;
            }
            List<Node> annotations = findNodes(annotationList, this.NODE_NAME_ANNOTATION, null, null, null);
            for (Node annotation : annotations) {
                parseAnnotation(annotation, str, category);
            }
        }
    }

    /**
     * Processes one protein node: when it has an Ensembl gene ID and at least one Ensembl
     * transcript, its sequences are collected and its annotations parsed.
     */
    void parseProteinNode(Node node) {
        String uniqueName = getAttribute(node, this.ATTR_NAME_UNIQUE_NAME);
        if (this.debug) {
            Timer.showStdErr("Parsing protein node: " + uniqueName);
        }

        String geneId = getGeneId(node, uniqueName);
        if (geneId == null) {
            return;
        }
        if (this.debug) {
            Timer.showStdErr("\tFound matching gene ID: " + geneId);
        }

        if (!findTrIds(node)) {
            return;
        }
        findSequences(node);
        parseAnnotations(node, geneId);
    }

    /**
     * (Re)creates {@code trIdMap}. When a transcript ID file is configured, each of its lines
     * with at least two tab-separated columns adds the mapping
     * {@code second column -> first column} (both trimmed).
     */
    void readTrIdMap() {
        this.trIdMap = new HashMap<>();
        if (this.trIdFile == null) {
            return;
        }
        if (this.verbose) {
            Timer.showStdErr("Reading transcripts file '" + this.trIdFile + "'");
        }

        String contents = Gpr.readFile(this.trIdFile);
        String[] lines = contents.split(IOUtils.LINE_SEPARATOR_UNIX);
        for (String line : lines) {
            String[] columns = line.split("\t");
            if (columns.length <= 1) {
                continue;
            }
            String mappedId = columns[0].trim();
            String sourceId = columns[1].trim();
            this.trIdMap.put(sourceId, mappedId);
        }
    }

    /**
     * Switches debug output on or off.
     */
    public void setDebug(boolean z) {
        debug = z;
    }

    /**
     * Sets the path of the transcript ID mapping file read by {@link #readTrIdMap()};
     * {@code null} means no file.
     */
    public void setTrIdFile(String str) {
        trIdFile = str;
    }

    /**
     * Switches verbose output on or off.
     */
    public void setVerbose(boolean z) {
        verbose = z;
    }

    /**
     * Renders a node for debug output as {@code name( attr='value', ... ) = 'value'\n}.
     * The attribute list is present only when the node has an attribute map, and the value
     * part only when the node has a value (newlines replaced by spaces, trimmed).
     */
    String toString(Node node) {
        String name = node.getNodeName();
        String value = node.getNodeValue();
        if (value != null) {
            value = value.replace('\n', ' ').trim();
        }

        StringBuilder out = new StringBuilder();
        out.append(name);

        NamedNodeMap attributes = node.getAttributes();
        if (attributes != null) {
            out.append("( ");
            String separator = "";
            for (int idx = 0; idx < attributes.getLength(); idx++) {
                Node attribute = attributes.item(idx);
                String attrName = attribute.getNodeName();
                String attrValue = attribute.getNodeValue();
                out.append(separator).append(attrName).append("='").append(attrValue).append("'");
                separator = ", ";
            }
            out.append(" )");
        }

        if (value != null) {
            out.append(" = '").append(value).append("'\n");
        }
        return out.toString();
    }

    /**
     * Maps an amino acid range of a NextProt isoform onto the matching transcript.
     *
     * <p>The transcript is looked up via {@code trIdByUniqueName} and {@code trById}. Its
     * protein sequence (without a trailing '*') must equal the NextProt sequence; otherwise
     * the transcript ID is remembered in {@code proteinDifferences} and, in verbose mode, a
     * warning is shown the first time. For matching proteins the genomic start/end, chromosome
     * name, codons and translated amino acids of the range are filled in.
     *
     * @param str  isoform unique name
     * @param i    zero-based first amino acid of the range
     * @param i2   zero-based last amino acid of the range (inclusive)
     * @param str2 full NextProt protein sequence of the isoform; may be {@code null} when unknown
     * @param str3 NextProt sub-sequence for the range
     * @return the collected data; its {@code ok} flag is set only when the range was mapped
     */
    TranscriptData transcriptData(String str, int i, int i2, String str2, String str3) {
        TranscriptData trData = new TranscriptData();

        String trId = this.trIdByUniqueName.get(str);
        if (trId == null) {
            return trData;
        }
        Transcript transcript = this.trById.get(trId);
        if (transcript == null) {
            return trData;
        }
        trData.tr = transcript;

        // Compare proteins without the stop codon marker
        String protein = transcript.protein();
        if (!protein.isEmpty() && protein.charAt(protein.length() - 1) == '*') {
            protein = protein.substring(0, protein.length() - 1);
        }

        if (!protein.equals(str2)) {
            boolean firstTime = this.proteinDifferences.add(trId);
            if (firstTime && this.verbose) {
                // str2 is null when no NextProt sequence is known for this isoform
                int nextProtLen = (str2 == null) ? 0 : str2.length();
                Timer.showStdErr("WARNING: Protein sequences differ: \tUniqueName: " + str + "\tEnsembl ID: " + trId + "\n\tEnsembl  (" + protein.length() + "): " + protein + "\n\tNextProt (" + nextProtLen + "): " + str2 + IOUtils.LINE_SEPARATOR_UNIX);
            }
            return trData;
        }

        this.proteinOk.add(trId);
        if (i < 0 || i2 < i) {
            return trData;
        }

        // CDS base range covered by the amino acid range (three bases per amino acid)
        int[] cds2pos = transcript.baseNumberCds2Pos();
        int cdsStart = i * 3;
        int cdsEnd = ((i2 + 1) * 3) - 1;
        if (cdsStart >= cds2pos.length || cdsEnd >= cds2pos.length) {
            return trData;
        }

        if (transcript.isStrandPlus()) {
            trData.chrPosStart = cds2pos[cdsStart];
            trData.chrPosEnd = cds2pos[cdsEnd];
        } else {
            trData.chrPosStart = cds2pos[cdsEnd];
            trData.chrPosEnd = cds2pos[cdsStart];
        }
        trData.chrName = transcript.getChromosomeName();
        trData.codon = transcript.cds().substring(cdsStart, cdsEnd + 1);
        trData.aa = CodonTables.getInstance().aa(trData.codon, this.genome, trData.chrName);

        // NOTE(review): a mismatch rejects the entry only in verbose mode; in non-verbose mode
        // mismatching entries are accepted silently. Kept as is - confirm whether intended.
        if (str3.equals(trData.aa) || !this.verbose) {
            trData.ok = true;
        } else {
            Timer.showStdErr("WARNING: AA differ: \tUniqueName : " + str + "\tEnsembl ID : " + trId + "\tEnsembl  AA: '" + trData.aa + "'\tNextProt AA: '" + str3 + "'\n");
        }
        return trData;
    }

    /**
     * Makes a string usable inside a VCF field: trims it and replaces every run of commas,
     * semicolons, equal signs, spaces and tabs by a single underscore.
     */
    String vcfSafe(String str) {
        String trimmed = str.trim();
        String safe = trimmed.replaceAll("(,|;|=| |\t)+", "_");
        return safe;
    }
}
