#!/usr/bin/ruby
# (c) Yoann Guillot 2009
# License: GPL

# Decodes a UTF-8 byte stream into integer codepoints.
#
# The source may be any object responding to +each_byte+ (a String, an IO,
# ARGF, ...). Malformed sequences are either passed through byte by byte
# (+mixed+ true) or dropped with a warning on $stderr (+mixed+ false).
class UnUTF8
  # Smallest codepoint that legitimately needs a sequence of the given total
  # length in bytes; anything below is an overlong encoding and is rejected.
  MIN_CODEPOINT = { 2 => 0x80, 3 => 0x800, 4 => 0x10000 }.freeze

  # Byte value of "\n", used to keep the line counter for diagnostics.
  NEWLINE = 0x0a

  # Byte value of ".", substituted by #to_s for codepoints above 0xff.
  PLACEHOLDER = 0x2e

  # mixed: whether invalid sequences are passed through (true) or discarded (false)
  # utf8:  the byte source being decoded
  attr_accessor :mixed, :utf8

  # str   - object responding to each_byte that provides the UTF-8 data
  # mixed - true to pass invalid bytes through unchanged, false to drop them
  def initialize(str, mixed=true)
    @utf8 = str
    @mixed = mixed
  end

  # Yields each decoded codepoint as an Integer.
  #
  # Handles sequences of 1 to 4 bytes. A sequence is invalid when it is
  # truncated (by a non-continuation byte or by end of input), when it is an
  # overlong encoding, or when a byte cannot start a sequence (stray
  # continuation byte, 0xf8..0xff). In mixed mode the raw bytes of an invalid
  # sequence are yielded one by one (a message is printed only if $DEBUG);
  # otherwise they are dropped and a message is printed on $stderr.
  #
  # Returns an Enumerator when called without a block.
  def each_char
    return enum_for(:each_char) unless block_given?

    pending = []   # raw bytes of the multi-byte sequence currently being decoded
    curchar = 0    # codepoint bits accumulated so far
    curlen = 0     # number of continuation bytes still expected
    linecnt = 1    # current input line, for diagnostics only

    # Reports an invalid byte sequence and, in mixed mode, passes it through.
    invalid = proc { |bytes|
      if !@mixed or $DEBUG
        $stderr.puts "invalid utf8 sequence #{bytes.pack('C*').unpack('H*').first} line #{linecnt}"
      end
      bytes.each { |bb| yield bb } if @mixed
    }

    @utf8.each_byte { |b|
      if curlen > 0
        if (b & 0xc0) == 0x80
          pending << b
          curchar = (curchar << 6) | (b & 0x3f)
          curlen -= 1
          if curlen == 0
            # Overlong forms can only be judged once the value is complete:
            # a zero payload in an 0xe0/0xf0 lead byte is perfectly legal.
            if curchar < MIN_CODEPOINT[pending.length]
              invalid[pending]
            else
              yield curchar
            end
            pending = []
          end
          next
        end

        # Truncated sequence: flush what was consumed so far, then fall
        # through so that b is decoded as the start of a new character
        # instead of being swallowed with the broken sequence.
        invalid[pending]
        pending = []
        curlen = 0
      end

      if (b & 0x80) == 0
        yield b
        linecnt += 1 if b == NEWLINE
      elsif (b & 0xe0) == 0xc0
        curlen = 1
        curchar = b & 0x1f
        pending = [b]
      elsif (b & 0xf0) == 0xe0
        curlen = 2
        curchar = b & 0x0f
        pending = [b]
      elsif (b & 0xf8) == 0xf0
        curlen = 3
        curchar = b & 0x07
        pending = [b]
      else
        # stray continuation byte, or a byte that never appears in UTF-8
        invalid[[b]]
      end
    }

    # input ended in the middle of a sequence
    invalid[pending] if curlen > 0
    self
  end

  # Returns the decoded data as a binary String holding one byte per
  # codepoint; codepoints above 0xff are replaced by '.'.
  def to_s
    bytes = []
    each_char { |c| bytes << (c < 256 ? c : PLACEHOLDER) }
    bytes.pack('C*')
  end
end

if $0 == __FILE__
  mixed = true
  mixed = true if ARGV.delete '--mixed'    # pass invalid sequences through unchanged
  mixed = false if ARGV.delete '--strict'  # drop invalid sequences, reporting them on stderr
  html = ARGV.delete('--html') ? true : false  # emit &#x..; entities instead of raw bytes / '.'

  if ARGV.first and ARGV.first.include? '://'
    require 'open-uri'
    # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is public only
    # on Rubies that provide it, so older interpreters keep using open().
    fd = URI.respond_to?(:open) ? URI.open(ARGV.first) : open(ARGV.first)
  else
    fd = ARGF
  end

  $stdout.binmode
  # String.new yields a binary string, so `<< Integer` appends exactly one
  # byte instead of a (possibly multi-byte) character.
  outbuf = String.new
  UnUTF8.new(fd, mixed).each_char { |c|
    if html
      c = '&#x%x;' % c if c > 0x7f
    else
      c = '.' if c > 0xff
    end
    outbuf << c
    if outbuf.length > 4096
      print outbuf
      outbuf = String.new
    end
  }
  print outbuf
  $stdout.flush
end