/*
 * Decompiled with CFR 0.152.
 */
package edu.msu.cme.rdp.classifier.train.validation.leaveoneout;

import edu.msu.cme.rdp.classifier.train.LineageSequenceParser;
import edu.msu.cme.rdp.classifier.train.validation.TreeFactory;
import edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTester;
import edu.msu.cme.rdp.readseq.utils.orientation.GoodWordIterator;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

/**
 * Command-line driver for leave-one-out testing.
 *
 * <p>Constructing an instance builds a {@link TreeFactory} from a taxonomy file
 * and a training sequence file, writes a short header describing the run to the
 * output file, and then hands the query sequences to
 * {@link LeaveOneOutTester#classify}. {@link #main(String[])} parses the
 * command line and triggers that run.
 */
public class LeaveOneOutTesterMain {
    private static final Options options = new Options();
    public static final String QUERYFILE_LONG_OPT = "queryFile";
    public static final String OUTFILE_LONG_OPT = "outputFile";
    public static final String TRAIN_SEQFILE_LONG_OPT = "trainSeqFile";
    public static final String TRAIN_TAXONFILE_LONG_OPT = "trainTaxonFile";
    public static final String LENGTH_LONG_OPT = "length";
    public static final String QUERYFILE_SHORT_OPT = "q";
    public static final String OUTFILE_SHORT_OPT = "o";
    public static final String TRAIN_SEQFILE_SHORT_OPT = "s";
    public static final String TRAIN_TAXONFILE_SHORT_OPT = "t";
    public static final String LENGTH_SHORT_OPT = "l";
    public static final String HIDETAXON_SHORT_OPT = "h";
    public static final String TRAIN_SEQFILE_DESC = "training files in fasta format labelled with the lineage information. \nThe header of this fasta file starts with '>', followed by the sequence name, white space(s) and a list taxon names seperated by ';' with highest rank taxon first\nex: Root;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Enterobacter";
    public static final String TRAIN_TAXONFILE_DESC = "contains the hierarchical taxonomy information, taxon name and rank together is unique. \nThe format looks like the following: taxid*taxon name*parent taxid*depth*rank Note taxid, the parent taxid and depth should be in integer format. depth indicates the depth from the root taxon.";
    public static final String LENGTH_DESC = "the default is to test the entire query sequence. if specifiy a length, a region of the query sequence with the specified length will be random choosen for testing";
    public static final String QUERYFILE_DESC = "query file contains sequences, same format as the training sequence file";
    public static final String OUTFILE_DESC = "stat of leave-one-out testing including correctness rate at each rank, misclassified rate for each taxon ";

    // Names of the options that previously appeared only as inline literals.
    private static final String HIDETAXON_LONG_OPT = "hideTaxon";
    private static final String MIN_WORDS_SHORT_OPT = "w";
    private static final String MIN_WORDS_LONG_OPT = "minWords";
    /** Default and smallest accepted value for the minWords option. */
    private static final int MIN_BOOTSTRAP_WORDS = 5;

    /**
     * Runs one leave-one-out test and writes the report to {@code outFile}.
     *
     * @param taxFile             path of the taxonomy file handed to {@link TreeFactory}
     * @param trainseqFile        path of the training sequence file
     * @param testFile            path of the query sequence file
     * @param outFile             path of the report file to create
     * @param numGoodBases        when greater than zero the header reports a partial-sequence
     *                            run with this many good bases; otherwise a full-length run.
     *                            The value is also passed to the {@link LeaveOneOutTester}.
     * @param min_bootstrap_words minimum number of words for bootstrap, echoed in the header
     *                            and passed to {@code classify}
     * @param hideTaxon           passed to {@code classify} unchanged
     * @throws IOException if any of the files cannot be opened, read or written
     */
    public LeaveOneOutTesterMain(String taxFile, String trainseqFile, String testFile, String outFile, int numGoodBases, int min_bootstrap_words, boolean hideTaxon) throws IOException {
        // Forwarded as the useSeed argument of LeaveOneOutTester.classify.
        boolean useSeed = true;
        FileReader taxReader = new FileReader(taxFile);
        try {
            TreeFactory factory = new TreeFactory(taxReader);
            this.createTree(factory, trainseqFile);
            // The report file is only created once the training data has loaded.
            BufferedWriter outWriter = new BufferedWriter(new FileWriter(outFile));
            try {
                LineageSequenceParser parser = new LineageSequenceParser(new File(testFile));
                LeaveOneOutTester tester = new LeaveOneOutTester(outWriter, numGoodBases);
                outWriter.write("taxon file: " + taxFile + "\ntrain sequence file: " + trainseqFile + "\n");
                outWriter.write("word size: " + GoodWordIterator.getWordsize() + "\n");
                outWriter.write("minimum number of words for bootstrap: " + min_bootstrap_words + "\n");
                if (numGoodBases > 0) {
                    outWriter.write("query sequence file: " + testFile + "\nclassify partial sequence, number of good bases=" + numGoodBases + "\n");
                } else {
                    outWriter.write("query sequence file: " + testFile + "\nclassify full-length sequence \n");
                }
                outWriter.write("test rank: " + factory.getLowestRank());
                tester.classify(factory, parser, useSeed, min_bootstrap_words, hideTaxon);
            } finally {
                // Flushes the header and results even when classify fails. Closing an
                // already-closed writer is a no-op, so this is safe if the tester
                // closed it itself.
                outWriter.close();
            }
        } finally {
            taxReader.close();
        }
    }

    /**
     * Adds every sequence of the training file to {@code factory}, then asks the
     * factory to calculate its word prior.
     *
     * @param factory tree factory to populate
     * @param input   path of the training sequence file
     * @throws IOException if the training file cannot be read
     */
    private void createTree(TreeFactory factory, String input) throws IOException {
        // NOTE(review): the parser is not closed here; confirm whether
        // LineageSequenceParser exposes a close() that should be called.
        LineageSequenceParser parser = new LineageSequenceParser(new File(input));
        while (parser.hasNext()) {
            factory.addSequence(parser.next());
        }
        factory.calculateWordPrior();
    }

    /**
     * Returns the value of a mandatory option.
     *
     * @param line           parsed command line
     * @param opt            short name of the option
     * @param missingMessage message of the exception raised when the option is absent
     * @return the option's value
     * @throws IllegalArgumentException if the option was not supplied
     */
    private static String requireOption(CommandLine line, String opt, String missingMessage) {
        if (!line.hasOption(opt)) {
            throw new IllegalArgumentException(missingMessage);
        }
        return line.getOptionValue(opt);
    }

    /**
     * Parses the command line and runs the leave-one-out test.
     *
     * <p>The query, output, training sequence and training taxon files are
     * mandatory. On any command-line problem the error and the usage text are
     * printed and the method returns without running the test.
     *
     * @param args command-line arguments
     * @throws FileNotFoundException if an input file does not exist
     * @throws IOException           if reading or writing a file fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        String queryFile = null;
        String outputFile = null;
        String trainSeqFile = null;
        String trainTaxonFile = null;
        int length = 0;
        int min_bootstrap_words = MIN_BOOTSTRAP_WORDS;
        boolean hideTaxon = false;
        try {
            CommandLine line = new PosixParser().parse(options, args);
            queryFile = requireOption(line, QUERYFILE_SHORT_OPT, "query file must be specified");
            outputFile = requireOption(line, OUTFILE_SHORT_OPT, "output file must be specified");
            trainSeqFile = requireOption(line, TRAIN_SEQFILE_SHORT_OPT, "training sequence file must be specified");
            trainTaxonFile = requireOption(line, TRAIN_TAXONFILE_SHORT_OPT, "training taxon file must be specified");
            if (line.hasOption(LENGTH_SHORT_OPT)) {
                length = Integer.parseInt(line.getOptionValue(LENGTH_SHORT_OPT));
                if (length <= 0) {
                    throw new IllegalArgumentException(length + " must be a positive number ");
                }
            }
            if (line.hasOption(MIN_WORDS_SHORT_OPT)) {
                min_bootstrap_words = Integer.parseInt(line.getOptionValue(MIN_WORDS_SHORT_OPT));
                if (min_bootstrap_words < MIN_BOOTSTRAP_WORDS) {
                    throw new IllegalArgumentException(MIN_WORDS_LONG_OPT + " must be at least " + MIN_BOOTSTRAP_WORDS);
                }
            }
            hideTaxon = line.hasOption(HIDETAXON_SHORT_OPT);
        }
        catch (Exception e) {
            // Boundary of the program: covers parse failures, missing options and
            // malformed numbers alike, reporting them together with the usage text.
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(120, "LeaveOneOutTesterMain", "", options, "", true);
            return;
        }
        // All of the work happens in the constructor; the instance itself is not needed.
        new LeaveOneOutTesterMain(trainTaxonFile, trainSeqFile, queryFile, outputFile, length, min_bootstrap_words, hideTaxon);
    }

    // Registers the supported options. Must stay below the declaration of `options`,
    // because static initializers run in textual order.
    static {
        options.addOption(new Option(TRAIN_SEQFILE_SHORT_OPT, TRAIN_SEQFILE_LONG_OPT, true, TRAIN_SEQFILE_DESC));
        options.addOption(new Option(TRAIN_TAXONFILE_SHORT_OPT, TRAIN_TAXONFILE_LONG_OPT, true, TRAIN_TAXONFILE_DESC + " Recommend removing duplicate seqeunces using command rmdupseq"));
        options.addOption(new Option(QUERYFILE_SHORT_OPT, QUERYFILE_LONG_OPT, true, QUERYFILE_DESC));
        options.addOption(new Option(OUTFILE_SHORT_OPT, OUTFILE_LONG_OPT, true, OUTFILE_DESC));
        options.addOption(new Option(LENGTH_SHORT_OPT, LENGTH_LONG_OPT, true, LENGTH_DESC));
        options.addOption(new Option(MIN_WORDS_SHORT_OPT, MIN_WORDS_LONG_OPT, true, "minimum number of words for each bootstrap trial. Default(maximum) is 1/8 of the words of each sequence. Minimum is 5"));
        options.addOption(new Option(HIDETAXON_SHORT_OPT, HIDETAXON_LONG_OPT, false, "If set, remove the lowest taxon where a query sequence originally labelled from the training set. Default only remove the query seq from training set"));
    }
}

