/*
 * $Id: DefaultContentsAnalyzer.java,v 1.6 2006/02/19 09:26:22 akabane Exp $
 * LOGICAL-PARADOX.ORG
 * Copyright (C)2005 satoshi akabane(akabane@logical-paradox.org)
 *
 */
package org.logical_paradox.rss.http.contents;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;

import javax.swing.text.html.HTMLEditorKit.Parser;

import org.logical_paradox.common.util.StringUtils;
import org.logical_paradox.rss.http.ParserGetter;
import org.logical_paradox.rss.http.RSSHttpResponse;
import org.logical_paradox.rss.http.WebContents;

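/**
 * Default contents analyzer.
 * An analyzer that does not examine a site in detail;<br>
 * it only collects basic items such as the document title and URL as metadata.
 * @author satoshi akabane@logical-paradox.org
 * @version $Revision: 1.6 $
 */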
public class DefaultContentsAnalyzer implements ContentsAnalyzer {
	/** Shared filter that {@link #normalizeDocument(String)} applies to non-empty contents. */
	private static final ContentsFilter htmlContentsFilter = new HTMLContentsFilter();

	/**
	 * Analyzes fetched web contents and returns the result as metadata.
	 * <p>
	 * The URL, location string, content type and response code are copied from the
	 * response; the robots.txt flag is set when the location contains "robots.txt";
	 * the normalized document, site name and title are derived from <code>contents</code>.
	 * When no title can be extracted, the location string is used as the title.
	 * @param response HTTP response object
	 * @param contents document text (presumably HTML)
	 * @return metadata describing the contents
	 * @throws IllegalStateException if the response code is negative
	 */
	public WebContents analyze(RSSHttpResponse response, String contents) {
		WebContents wc = new WebContents();

		if(response.getResponseCode() < 0) {
			throw new IllegalStateException("HttpURLConnection object has no content");
		}

		// Original, unmodified contents
		wc.setOriginalDocument(contents);
		// URL
		wc.setURL(response.getURL());
		// Location of the contents (string form of the URL)
		wc.setLocation(response.getURL().toString());
		// MIME type
		wc.setContentType(response.getContentType());
		// HTTP status code
		wc.setResponseCode(response.getResponseCode());
		// Flag the result when the location refers to robots.txt
		wc.setRobotsTxtFlg(wc.getLocation().indexOf("robots.txt") >= 0);

		// Normalized document
		wc.setDocument(normalizeDocument(contents));
		// Site name
		wc.setSitename(getSiteName(contents));
		// Document title (fall back to the location when there is none)
		String title = getTitle(contents);
		wc.setTitle(StringUtils.isEmpty(title) ? wc.getLocation() : title);

		return wc;
	}
	/**
	 * Normalizes the contents by passing them through the HTML contents filter.
	 * @param contents original contents
	 * @return the filtered contents, or an empty string when <code>contents</code> is empty
	 */
	protected String normalizeDocument(String contents) {
		return StringUtils.isEmpty(contents) ? "" : htmlContentsFilter.filter(contents);
	}
	/**
	 * Analyzes the contents and returns the site name.
	 * @param contents original contents
	 * @return extracted site name
	 */
	protected String getSiteName(String contents) {
		// For now, simply return the same value as the document title
		return getTitle(contents);
	}
	/**
	 * Analyzes the contents and returns the document title.
	 * @param contents original contents
	 * @return extracted document title, or an empty string when the contents are empty,
	 *         no title is found, or parsing fails with an I/O error
	 */
	protected String getTitle(String contents) {
		if(StringUtils.isEmpty(contents)) {
			return "";
		}
		Parser parser = new ParserGetter().getParser();
		DocumentTitleCollectionCallback cb = new DocumentTitleCollectionCallback();
		// Hand the characters to the parser directly. Converting the string to bytes
		// and decoding it again with the platform default charset replaces every
		// character that charset cannot represent, which corrupts the title.
		StringReader reader = new StringReader(contents);
		try {
			parser.parse(reader, cb, true);
			String title = cb.getDocumentTitle();
			return StringUtils.isEmpty(title) ? "" : title;
		} catch (IOException e) {
			// Best effort: nothing useful can be done here, so report "no title"
			return "";
		} finally {
			reader.close();
		}
	}
}
