Kouhei Sutou
null+****@clear*****
Thu Mar 2 00:03:41 JST 2017
Kouhei Sutou 2017-03-02 00:03:41 +0900 (Thu, 02 Mar 2017) New Revision: 0caa210033635e8be9f69bea009ef2de87a24df5 https://github.com/ranguba/chupa-text-decomposer-html/commit/0caa210033635e8be9f69bea009ef2de87a24df5 Message: Support KOI8-R Modified files: lib/chupa-text/decomposers/html.rb test/test-html.rb Modified: lib/chupa-text/decomposers/html.rb (+5 -1) =================================================================== --- lib/chupa-text/decomposers/html.rb 2017-02-25 00:53:13 +0900 (2f6e3e8) +++ lib/chupa-text/decomposers/html.rb 2017-03-02 00:03:41 +0900 (42d1bef) @@ -74,7 +74,11 @@ module ChupaText charset = $2 normalize_charset(charset) else - guess_encoding_nkf(text) + if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding? + text.encoding.to_s + else + guess_encoding_nkf(text) + end end end Modified: test/test-html.rb (+15 -3) =================================================================== --- test/test-html.rb 2017-02-25 00:53:13 +0900 (4713b97) +++ test/test-html.rb 2017-03-02 00:03:41 +0900 (a504a69) @@ -138,12 +138,12 @@ class TestHTML < Test::Unit::TestCase sub_test_case("detect") do def test_nothing - @data.body = <<-HTML + @data.body = <<-HTML.force_encoding("UTF-8") <html> <body>Hello</body> </html> HTML - assert_equal([Encoding::US_ASCII], decompose(@data)) + assert_equal([Encoding::UTF_8], decompose(@data)) end def test_xml_declaration @@ -195,7 +195,7 @@ class TestHTML < Test::Unit::TestCase assert_equal([Encoding::ISO_2022_JP], decompose(@data)) end - def test_utf32 + def test_utf_32 @data.body = <<-UTF_32_HTML.encode("UTF-32") <html> <head> @@ -206,6 +206,18 @@ class TestHTML < Test::Unit::TestCase UTF_32_HTML assert_equal([Encoding::UTF_32], decompose(@data)) end + + def test_koi8_r + @data.body = <<-KOI8_R_HTML.encode("KOI8-R") +<html> + <head> + <title>название</title> + </head> + <body>Hello</body> +</html> + KOI8_R_HTML + assert_equal([Encoding::KOI8_R], decompose(@data)) + end end end -------------- next part 
-------------- HTMLの添付ファイルを保管しました... Download