/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.indexer;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.Locale;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.indexer.IndexerOutputFormat;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilters;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchIndexAction;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.metrics.ErrorTracker;
import org.apache.nutch.metrics.LatencyTracker;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IndexerMapReduce
extends Configured {
    /** Class logger; the owning class is resolved through {@code MethodHandles.lookup().lookupClass()}. */
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    /** Configuration key. Not read anywhere in this file. NOTE(review): presumably consumed by index writers - confirm. */
    public static final String INDEXER_PARAMS = "indexer.additional.params";
    /** Boolean configuration key read by {@code IndexerReducer.setup}; enables the delete actions the reducer writes for its "gone", "redirects" and "duplicates" counters. */
    public static final String INDEXER_DELETE = "indexer.delete";
    /** Configuration key. Not read anywhere in this file. NOTE(review): presumably tells the indexing job to skip the commit - confirm. */
    public static final String INDEXER_NO_COMMIT = "indexer.nocommit";
    /** Boolean configuration key read by {@code IndexerReducer.setup}; when set, a delete action is written for pages whose "robots" parse metadata contains "noindex". */
    public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
    /** Boolean configuration key read by {@code IndexerReducer.setup}; when set, a delete action is written for documents for which the indexing filters return {@code null}. */
    public static final String INDEXER_DELETE_SKIPPED = "indexer.delete.skipped.by.indexingfilter";
    /** Boolean configuration key read by {@code IndexerReducer.setup}; when set, documents whose CrawlDb status is 6 are counted as "skipped not modified" and not indexed. */
    public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
    /** Boolean configuration key read by both mapper and reducer; turns URL filtering on. */
    public static final String URL_FILTERING = "indexer.url.filters";
    /** Boolean configuration key read by both mapper and reducer; turns URL normalization (scope "indexer") on. */
    public static final String URL_NORMALIZING = "indexer.url.normalizers";
    /** Boolean configuration key read by {@code IndexerReducer.setup}; when set, raw content is added to the document Base64-encoded instead of as a plain string. */
    public static final String INDEXER_BINARY_AS_BASE64 = "indexer.binary.base64";
    /** Shared action (no document, action code 1) written by the reducer for every deletion. NOTE(review): code 1 is presumably the DELETE constant of NutchIndexAction - confirm. */
    private static final NutchIndexAction DELETE_ACTION = new NutchIndexAction(null, 1);

    /**
     * Passes a URL through the URL normalizers using the {@code "indexer"} scope.
     *
     * @param url the URL to normalize
     * @param normalize when {@code false} the URL is handed back untouched
     * @param urlNormalizers the normalizer chain; if {@code null} while normalization is requested, the result is {@code null}
     * @return the trimmed normalized URL, the unchanged input when normalization is off,
     *         or {@code null} when no normalizers are available or normalization fails
     */
    private static String normalizeUrl(String url, boolean normalize, URLNormalizers urlNormalizers) {
        if (!normalize) {
            return url;
        }
        if (urlNormalizers == null) {
            return null;
        }
        try {
            // trim() deliberately stays inside the try: a null result from the
            // normalizers ends up in the catch below and is logged there.
            return urlNormalizers.normalize(url, "indexer").trim();
        }
        catch (Exception failure) {
            LOG.warn("Skipping {}: {}", (Object)url, (Object)failure);
            return null;
        }
    }

    /**
     * Passes a URL through the URL filters.
     *
     * <p>A {@code null} URL (for example one rejected by normalization) or a missing
     * filter chain is rejected up front instead of being pushed into the filters and
     * relying on a caught exception. Failures raised by a filter are logged in the same
     * way {@code normalizeUrl} logs normalizer failures, rather than being dropped silently.
     *
     * @param url the URL to filter, may be {@code null}
     * @param filter when {@code false} the URL is handed back untouched
     * @param urlFilters the filter chain to apply
     * @return the URL as returned by the filters, the unchanged input when filtering is off,
     *         or {@code null} when the URL is rejected or filtering fails
     */
    private static String filterUrl(String url, boolean filter, URLFilters urlFilters) {
        if (!filter) {
            return url;
        }
        if (url == null || urlFilters == null) {
            return null;
        }
        try {
            return urlFilters.filter(url);
        }
        catch (Exception e) {
            // The URL is still dropped (best effort), but the cause is no longer invisible.
            LOG.warn("Skipping {}: {}", (Object)url, (Object)e);
            return null;
        }
    }

    /**
     * Configures the inputs, mapper, reducer and output settings of an indexing job.
     *
     * <p>Inputs added, in order:
     * <ul>
     *   <li>{@code <crawlDb>/current}, if a crawlDb is given and that path exists;</li>
     *   <li>for every segment: {@code crawl_fetch}, {@code crawl_parse}, {@code parse_data},
     *       {@code parse_text} and, when {@code addBinaryContent} is set, {@code content};</li>
     *   <li>{@code <linkDb>/current}, if a linkDb is given and that path exists.</li>
     * </ul>
     * A missing or unreadable crawlDb/linkDb is logged and ignored; it does not fail the job setup.
     *
     * @param crawlDb CrawlDb directory, or {@code null} to index without it
     * @param linkDb LinkDb directory, or {@code null} to index without it
     * @param segments segment directories to index
     * @param job the job to configure
     * @param addBinaryContent whether each segment's {@code content} directory is added as input
     * @throws IOException if adding a segment input path fails
     */
    public static void initMRJob(Path crawlDb, Path linkDb, Collection<Path> segments, Job job, boolean addBinaryContent) throws IOException {
        Configuration conf = job.getConfiguration();

        if (crawlDb == null) {
            LOG.info("IndexerMapReduce: no crawldb provided for indexing");
        } else {
            LOG.info("IndexerMapReduce: crawldb: {}", crawlDb);
            Path currentCrawlDb = new Path(crawlDb, "current");
            try {
                if (currentCrawlDb.getFileSystem(conf).exists(currentCrawlDb)) {
                    FileInputFormat.addInputPath(job, currentCrawlDb);
                } else {
                    LOG.warn("Ignoring crawlDb for indexing, no crawlDb found in path: {}", crawlDb);
                }
            }
            catch (IOException e) {
                // Best effort: indexing continues without the crawlDb.
                LOG.warn("Failed to use crawlDb ({}) for indexing", crawlDb, e);
            }
        }

        for (Path segment : segments) {
            LOG.info("IndexerMapReduce: adding segment: {}", segment);
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_fetch"));
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_parse"));
            FileInputFormat.addInputPath(job, new Path(segment, "parse_data"));
            FileInputFormat.addInputPath(job, new Path(segment, "parse_text"));
            if (addBinaryContent) {
                FileInputFormat.addInputPath(job, new Path(segment, "content"));
            }
        }

        if (linkDb != null) {
            LOG.info("IndexerMapReduce: linkdb: {}", linkDb);
            Path currentLinkDb = new Path(linkDb, "current");
            try {
                if (currentLinkDb.getFileSystem(conf).exists(currentLinkDb)) {
                    FileInputFormat.addInputPath(job, currentLinkDb);
                } else {
                    LOG.warn("Ignoring linkDb for indexing, no linkDb found in path: {}", linkDb);
                }
            }
            catch (IOException e) {
                // Best effort: indexing continues without the linkDb. The throwable is handed
                // to the logger directly, matching the crawlDb branch above.
                LOG.warn("Failed to use linkDb ({}) for indexing", linkDb, e);
            }
        }

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setJarByClass(IndexerMapReduce.class);
        job.setMapperClass(IndexerMapper.class);
        job.setReducerClass(IndexerReducer.class);
        job.setOutputFormatClass(IndexerOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NutchWritable.class);
        // NOTE(review): the reducer is declared to emit NutchIndexAction values while the job's
        // output value class is NutchWritable; left unchanged - confirm this is intended.
        job.setOutputValueClass(NutchWritable.class);
    }

    public static class IndexerReducer
    extends Reducer<Text, NutchWritable, Text, NutchIndexAction> {
        // Set from INDEXER_SKIP_NOTMODIFIED: skip documents whose CrawlDb status is 6.
        private boolean skip = false;
        // Set from INDEXER_DELETE: write delete actions for the "gone", "redirects" and "duplicates" cases.
        private boolean delete = false;
        // Set from INDEXER_DELETE_ROBOTS_NOINDEX: write a delete action when the "robots" parse metadata contains "noindex".
        private boolean deleteRobotsNoIndex = false;
        // Set from INDEXER_DELETE_SKIPPED: write a delete action when the indexing filters return null.
        private boolean deleteSkippedByIndexingFilter = false;
        // Set from INDEXER_BINARY_AS_BASE64: add raw content Base64-encoded.
        private boolean base64 = false;
        // Indexing and scoring filter chains, created from the job configuration in setup().
        private IndexingFilters filters;
        private ScoringFilters scfilters;
        // Set from URL_NORMALIZING / URL_FILTERING; the helpers below are only created when the matching flag is on.
        private boolean normalize = false;
        private boolean filter = false;
        private URLNormalizers urlNormalizers;
        private URLFilters urlFilters;
        // Records the time spent building each indexed document; its counters are emitted in cleanup().
        private LatencyTracker indexLatencyTracker;
        // Counters of group "nutch_indexer", looked up once in initCounters().
        private Counter deletedRobotsNoIndexCounter;
        private Counter deletedGoneCounter;
        private Counter deletedRedirectsCounter;
        private Counter deletedDuplicatesCounter;
        private Counter skippedNotModifiedCounter;
        private Counter deletedByIndexingFilterCounter;
        private Counter skippedByIndexingFilterCounter;
        private Counter indexedCounter;
        // Receives scoring and indexing filter exceptions caught in reduce().
        private ErrorTracker errorTracker;

        /**
         * Reads the indexer settings from the job configuration and creates the filter
         * chains, optional URL normalizers/filters, the latency tracker and the counters.
         *
         * <p>The context parameter is fully parameterized (instead of the raw
         * {@code Reducer.Context}) and the method is marked {@code @Override} so the compiler
         * verifies it really replaces the framework hook.
         *
         * @param context task context supplying the configuration and counters
         */
        @Override
        public void setup(Reducer<Text, NutchWritable, Text, NutchIndexAction>.Context context) {
            Configuration conf = context.getConfiguration();
            this.filters = new IndexingFilters(conf);
            this.scfilters = new ScoringFilters(conf);
            this.delete = conf.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
            this.deleteRobotsNoIndex = conf.getBoolean(IndexerMapReduce.INDEXER_DELETE_ROBOTS_NOINDEX, false);
            this.deleteSkippedByIndexingFilter = conf.getBoolean(IndexerMapReduce.INDEXER_DELETE_SKIPPED, false);
            this.skip = conf.getBoolean(IndexerMapReduce.INDEXER_SKIP_NOTMODIFIED, false);
            this.base64 = conf.getBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, false);
            this.normalize = conf.getBoolean(IndexerMapReduce.URL_NORMALIZING, false);
            this.filter = conf.getBoolean(IndexerMapReduce.URL_FILTERING, false);
            // The normalizer and filter chains are only built when their flag is enabled.
            if (this.normalize) {
                this.urlNormalizers = new URLNormalizers(conf, "indexer");
            }
            if (this.filter) {
                this.urlFilters = new URLFilters(conf);
            }
            this.indexLatencyTracker = new LatencyTracker("nutch_indexer", "index_latency");
            this.initCounters(context);
        }

        /**
         * Looks up every counter used by {@code reduce} once, so the per-record path only
         * increments cached references, and creates the error tracker for this task.
         *
         * @param context task context the counters are obtained from
         */
        private void initCounters(Reducer.Context context) {
            // All counters of this reducer live in one counter group.
            final String group = "nutch_indexer";

            this.deletedRobotsNoIndexCounter = context.getCounter(group, "deleted_robots_noindex_total");
            this.deletedGoneCounter = context.getCounter(group, "deleted_gone_total");
            this.deletedRedirectsCounter = context.getCounter(group, "deleted_redirects_total");
            this.deletedDuplicatesCounter = context.getCounter(group, "deleted_duplicates_total");
            this.skippedNotModifiedCounter = context.getCounter(group, "skipped_not_modified_total");
            this.deletedByIndexingFilterCounter = context.getCounter(group, "deleted_by_indexing_filter_total");
            this.skippedByIndexingFilterCounter = context.getCounter(group, "skipped_by_indexing_filter_total");
            this.indexedCounter = context.getCounter(group, "indexed_total");

            this.errorTracker = new ErrorTracker(group, (TaskInputOutputContext<?, ?, ?, ?>)context);
        }

        /**
         * Emits the latency measurements gathered during {@code reduce} as counters on the
         * task context.
         *
         * <p>The context parameter is fully parameterized (instead of the raw
         * {@code Reducer.Context}) and the method is marked {@code @Override} so the compiler
         * verifies it really replaces the framework hook.
         *
         * @param context task context the counters are emitted to
         * @throws IOException declared by the overridden framework method
         * @throws InterruptedException declared by the overridden framework method
         */
        @Override
        public void cleanup(Reducer<Text, NutchWritable, Text, NutchIndexAction>.Context context) throws IOException, InterruptedException {
            this.indexLatencyTracker.emitCounters(context);
        }

        /**
         * Merges every record collected for one URL and emits at most one index action for it.
         *
         * <p>The values are sorted by type into inlinks, the CrawlDb datum, the fetch datum,
         * parse data, parse text and raw content. Depending on configuration a delete action
         * is written for robots-noindex pages, gone pages, redirects, duplicates and documents
         * rejected by the indexing filters. Otherwise a {@link NutchDocument} is built, scored,
         * run through the indexing filters and written with action code 0
         * (NOTE(review): presumably the ADD constant of NutchIndexAction - confirm).
         *
         * <p>Numeric status codes are kept as literals; their meaning noted in the comments is
         * inferred from the counter each branch increments and should be confirmed against
         * {@code CrawlDatum}.
         *
         * @param key URL the values belong to; also used as the document id
         * @param values wrapped records collected for that URL
         * @param context task context used to emit index actions
         * @throws IOException if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         * @throws RuntimeException if a CrawlDatum has a status that is neither a db status,
         *         a fetch status, nor one of 65, 67 or 68
         */
        @Override
        public void reduce(Text key, Iterable<NutchWritable> values, Reducer<Text, NutchWritable, Text, NutchIndexAction>.Context context) throws IOException, InterruptedException {
            Inlinks inlinks = null;
            CrawlDatum dbDatum = null;
            CrawlDatum fetchDatum = null;
            Content content = null;
            ParseData parseData = null;
            ParseText parseText = null;

            // Sort the incoming records by type.
            for (NutchWritable val : values) {
                Writable value = val.get();
                if (value instanceof Inlinks) {
                    inlinks = (Inlinks)value;
                } else if (value instanceof CrawlDatum) {
                    CrawlDatum datum = (CrawlDatum)value;
                    if (CrawlDatum.hasDbStatus(datum)) {
                        dbDatum = datum;
                    } else if (CrawlDatum.hasFetchStatus(datum)) {
                        // Fetch status 38 is never kept as fetch datum (presumably "not modified" - confirm).
                        if (datum.getStatus() != 38) {
                            fetchDatum = datum;
                        }
                    } else if (67 != datum.getStatus() && 65 != datum.getStatus() && 68 != datum.getStatus()) {
                        // 65, 67 and 68 are silently ignored; anything else is unexpected here.
                        throw new RuntimeException("Unexpected status: " + datum.getStatus());
                    }
                } else if (value instanceof ParseData) {
                    parseData = (ParseData)value;
                    if (this.deleteRobotsNoIndex) {
                        String robotsMeta = parseData.getMeta("robots");
                        if (robotsMeta != null && robotsMeta.toLowerCase(Locale.ROOT).indexOf("noindex") != -1) {
                            context.write(key, DELETE_ACTION);
                            this.deletedRobotsNoIndexCounter.increment(1L);
                            return;
                        }
                    }
                } else if (value instanceof ParseText) {
                    parseText = (ParseText)value;
                } else if (value instanceof Content) {
                    content = (Content)value;
                } else {
                    LOG.warn("Unrecognized type: {}", value.getClass());
                }
            }

            if (this.delete && fetchDatum != null) {
                // Fetch status 37 or db status 3: counted as "gone".
                if (fetchDatum.getStatus() == 37 || dbDatum != null && dbDatum.getStatus() == 3) {
                    this.deletedGoneCounter.increment(1L);
                    context.write(key, DELETE_ACTION);
                    return;
                }
                // Fetch status 36/35 or db status 5/4: counted as "redirects".
                if (fetchDatum.getStatus() == 36 || fetchDatum.getStatus() == 35 || dbDatum != null && dbDatum.getStatus() == 5 || dbDatum != null && dbDatum.getStatus() == 4) {
                    this.deletedRedirectsCounter.increment(1L);
                    context.write(key, DELETE_ACTION);
                    return;
                }
            }

            // Without fetch datum, parse text and parse data there is nothing to index.
            if (fetchDatum == null || parseText == null || parseData == null) {
                return;
            }

            // Db status 7: counted as "duplicates".
            if (this.delete && dbDatum != null && dbDatum.getStatus() == 7) {
                this.deletedDuplicatesCounter.increment(1L);
                context.write(key, DELETE_ACTION);
                return;
            }

            // Db status 6: counted as "skipped not modified".
            if (this.skip && dbDatum != null && dbDatum.getStatus() == 6) {
                this.skippedNotModifiedCounter.increment(1L);
                return;
            }

            // Only successfully parsed pages with fetch status 33 are indexed.
            if (!parseData.getStatus().isSuccess() || fetchDatum.getStatus() != 33) {
                return;
            }

            long indexStart = System.currentTimeMillis();
            NutchDocument doc = new NutchDocument();
            doc.add("id", key.toString());
            Metadata metadata = parseData.getContentMeta();
            doc.add("segment", metadata.get("nutch.segment.name"));
            doc.add("digest", metadata.get("nutch.content.digest"));
            ParseImpl parse = new ParseImpl(parseText, parseData);

            float boost = 1.0f;
            try {
                boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, inlinks, boost);
            }
            catch (ScoringFilterException e) {
                this.errorTracker.incrementCounters(e);
                LOG.warn("Error calculating score {}: {}", key, e);
                return;
            }
            doc.setWeight(boost);
            doc.add("boost", Float.toString(boost));

            try {
                if (dbDatum != null) {
                    // Hand the CrawlDb signature and representation URL to the indexing filters.
                    fetchDatum.setSignature(dbDatum.getSignature());
                    Text url = (Text)dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
                    if (url != null) {
                        // Normalize/filter the representation URL itself. Using the key here
                        // would overwrite the representation URL with the primary URL.
                        // If the representation URL is rejected, the document is still indexed
                        // and the value is simply not propagated.
                        String urlString = IndexerMapReduce.filterUrl(IndexerMapReduce.normalizeUrl(url.toString(), this.normalize, this.urlNormalizers), this.filter, this.urlFilters);
                        if (urlString != null) {
                            url.set(urlString);
                            fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
                        }
                    }
                }
                doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
            }
            catch (IndexingException e) {
                LOG.warn("Error indexing {}: ", key, e);
                this.errorTracker.incrementCounters(e);
                return;
            }

            // A null document means the indexing filters rejected it.
            if (doc == null) {
                if (this.deleteSkippedByIndexingFilter) {
                    context.write(key, DELETE_ACTION);
                    this.deletedByIndexingFilterCounter.increment(1L);
                } else {
                    this.skippedByIndexingFilterCounter.increment(1L);
                }
                return;
            }

            if (content != null) {
                byte[] raw = content.getContent();
                // The charset is stated explicitly so the result does not depend on the
                // platform default of the node running the task.
                String binary = this.base64
                    ? org.apache.commons.codec.binary.StringUtils.newStringUtf8(Base64.encodeBase64(raw, false, false))
                    : new String(raw, java.nio.charset.StandardCharsets.UTF_8);
                doc.add("binaryContent", binary);
            }

            this.indexLatencyTracker.record(System.currentTimeMillis() - indexStart);
            this.indexedCounter.increment(1L);
            context.write(key, new NutchIndexAction(doc, 0));
        }
    }

    public static class IndexerMapper
    extends Mapper<Text, Writable, Text, NutchWritable> {
        private boolean normalize = false;
        private boolean filter = false;
        private URLNormalizers urlNormalizers;
        private URLFilters urlFilters;

        public void setup(Mapper.Context context) {
            Configuration conf = context.getConfiguration();
            this.normalize = conf.getBoolean(IndexerMapReduce.URL_NORMALIZING, false);
            this.filter = conf.getBoolean(IndexerMapReduce.URL_FILTERING, false);
            if (this.normalize) {
                this.urlNormalizers = new URLNormalizers(conf, "indexer");
            }
            if (this.filter) {
                this.urlFilters = new URLFilters(conf);
            }
        }

        public void map(Text key, Writable value, Mapper.Context context) throws IOException, InterruptedException {
            String urlString = IndexerMapReduce.filterUrl(IndexerMapReduce.normalizeUrl(key.toString(), this.normalize, this.urlNormalizers), this.filter, this.urlFilters);
            if (urlString == null) {
                return;
            }
            key.set(urlString);
            context.write((Object)key, (Object)new NutchWritable(value));
        }
    }
}

