# Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

require "htree"
require File.join(File.dirname(__FILE__), "read-buckets-to-buffer")

# Rast filter for HTML documents: extracts document properties (title, meta
# name/content pairs, author) and passes the text content downstream as
# "text/plain".
#
# The raw document is read from @buffer.
# NOTE(review): @buffer is presumably filled by the ReadBucketsToBuffer mixin
# before process_buffer is invoked -- confirm against that module.
class TextHtml
  # NOTE(review): these constants are not read anywhere in this file; they
  # look like registration metadata for the filter framework -- confirm.
  SUPPORTED_VERSION = 1
  MIME_TYPE = "text/html"
  ENCODINGS = ["UTF-8", "EUC-JP"]

  include ReadBucketsToBuffer

  private

  # Parses @buffer as HTML, sets properties on +filter+ and passes the
  # extracted text to the next filter.
  #
  # filter::    object providing db_encoding, set_property and pass.
  # mime_type:: unused here; kept for interface compatibility.
  #
  # Properties set:
  # * "title"  -- the document title, when present.
  # * one property per <meta name=... content=...> element.
  # * "author" -- from <link rev="made" href=...> (a "mailto:" prefix is
  #   dropped) and, when an <address> element exists, the concatenation of
  #   all collected author strings followed by the address text.
  def process_buffer(filter, mime_type)
    db_encoding = filter.db_encoding

    # NOTE(review): the replacement space is produced in the database
    # encoding but spliced into a buffer that is still in the input encoding;
    # this is only safe while both encode U+0020 identically -- confirm.
    space = Rast::EncodingConverter.convert_encoding("US-ASCII", db_encoding,
                                                     " ")
    tree = HTree.parse(@buffer.gsub("&nbsp;", space))

    input_encoding = get_input_encoding(tree)
    # Converts text taken from the document into the database encoding.
    to_db = lambda do |text|
      Rast::EncodingConverter.convert_encoding(input_encoding, db_encoding,
                                               text)
    end

    title = tree.title
    if title
      filter.set_property("title", to_db.call(title.to_s))
    end

    author = ""
    tree.traverse_element("{http://www.w3.org/1999/xhtml}meta") do |elem|
      name = elem.get_attribute("name")
      content = elem.get_attribute("content")
      if name && content
        converted_name = to_db.call(name.to_s)
        converted_content = to_db.call(content.to_s)
        filter.set_property(converted_name, converted_content)
        if converted_name == "author"
          author.concat(converted_content)
          author.concat(" ")
        end
      end
    end

    tree.traverse_element("{http://www.w3.org/1999/xhtml}link") do |elem|
      href = elem.get_attribute("href")
      rev = elem.get_attribute("rev")
      if href && rev && rev.to_s == "made"
        href_str = href.to_s
        if (match_data = /mailto:(.*)/.match(href_str))
          href_str = match_data[1]
        end
        # Convert like every other property so the stored value is in the
        # database encoding.
        href_str = to_db.call(href_str)
        filter.set_property("author", href_str)
        author.concat(href_str)
        author.concat(" ")
      end
    end

    # Documents without a <body> (fragments, framesets) would otherwise fail
    # with NoMethodError on nil; fall back to the whole parsed document.
    body = tree.find_element("{http://www.w3.org/1999/xhtml}body") || tree

    if (address = body.find_element("{http://www.w3.org/1999/xhtml}address"))
      author.concat(to_db.call(address.extract_text.to_s.strip))
      filter.set_property("author", author)
    end

    bucket = Rast::TransientBucket.new(to_db.call(body.extract_text.to_s))
    next_brigade = Rast::Brigade.new
    next_brigade.insert_tail(bucket)
    next_brigade.insert_tail(Rast::EOSBucket.new)
    filter.pass(next_brigade, "text/plain")
  end

  # Returns the name of the document's character encoding.
  #
  # The charset parameter of the first
  # <meta http-equiv="content-type" content="...; charset=..."> element that
  # declares one wins; otherwise the encoding is guessed from @buffer among
  # Rast::JAPANESE_ENCODINGS.
  def get_input_encoding(tree)
    # Whitespace after ";" and around "=" is optional, and the value may be
    # quoted; capture only the bare charset token.
    charset_re = /;\s*charset\s*=\s*["']?([^;"'\s]+)/i

    tree.traverse_element("{http://www.w3.org/1999/xhtml}meta") do |elem|
      if "content-type" == elem.get_attribute("http-equiv").to_s.downcase
        content = elem.get_attribute("content")
        if (match_data = charset_re.match(content.to_s))
          return match_data[1]
        end
      end
    end

    return Rast::EncodingConverter.guess(@buffer, Rast::JAPANESE_ENCODINGS)
  end
end
