#!/usr/bin/env python
"""Repair mis-decoded (mojibake) lines read from standard input.

Standard input is read as text.  Any character in the range U+0080..U+00FF
is taken to be a raw byte of some legacy encoding that was decoded one byte
per character.  Lines containing such characters are turned back into their
raw bytes, chardet guesses the encoding of all of them together, and they
are decoded again with that guess.  Lines without such characters are
written back unchanged.  The guess and its confidence go to standard error.
"""
import io
import sys


def _to_raw_bytes(text):
    """Return ``(raw, suspicious)`` for a single line of text.

    ``raw`` holds one byte per character in U+0080..U+00FF and the UTF-8
    encoding of every other character.  ``suspicious`` is True when at
    least one character fell into that range.
    """
    raw = bytearray()
    suspicious = False
    for ch in text:
        code = ord(ch)
        if 0x80 <= code <= 0xff:
            # The code point *is* the original byte value.  Using ord()
            # instead of slicing ch.encode('utf-16') keeps the result
            # independent of the host's native byte order.
            raw.append(code)
            suspicious = True
        else:
            raw += ch.encode('utf-8')
    return bytes(raw), suspicious


def collect_lines(text):
    """Split *text* on newlines and sort the lines into two lists.

    Returns ``(all_lines, need_transform_lines)``.  ``all_lines`` has one
    entry per input line: the line's UTF-8 bytes, or ``None`` as a
    placeholder when the line has to be re-decoded.  The raw bytes of those
    placeholder lines are stored, in order, in ``need_transform_lines``.

    A final line that is not terminated by a newline is kept rather than
    dropped.
    """
    # str.splitlines() is deliberately avoided: it also breaks on U+0085,
    # which is one of the characters that must be treated as a raw byte.
    pieces = text.split('\n')
    if pieces[-1] == '':
        # The text was empty or ended with a newline; there is no
        # unterminated tail to keep.
        pieces.pop()

    all_lines = []
    need_transform_lines = []
    for piece in pieces:
        raw, suspicious = _to_raw_bytes(piece)
        if suspicious:
            need_transform_lines.append(raw)
            all_lines.append(None)
        else:
            all_lines.append(raw)
    return all_lines, need_transform_lines


def decode_lines(all_lines, need_transform_lines, encoding):
    """Return the decoded text of every line, in input order.

    ``None`` entries in *all_lines* are replaced by the next item of
    *need_transform_lines* decoded with *encoding*; all other entries are
    decoded as UTF-8.

    Raises:
        UnicodeDecodeError: if a line is not valid in the codec used.
    """
    pending = iter(need_transform_lines)
    decoded = []
    for line in all_lines:
        if line is None:
            decoded.append(next(pending).decode(encoding))
        else:
            decoded.append(line.decode('utf-8'))
    return decoded


def main():
    """Filter stdin to stdout; return the process exit status."""
    # Imported here so the helpers above stay importable (and testable)
    # without the third-party dependency installed.
    import chardet

    # closefd=False: closing this wrapper must not close the process's
    # stdin descriptor itself.
    with io.open(sys.stdin.fileno(), closefd=False) as reader:
        text = reader.read()

    all_lines, need_transform_lines = collect_lines(text)

    # Detect on all suspicious lines at once to give chardet more evidence.
    chardet_result = chardet.detect(b'\n'.join(need_transform_lines))
    encoding = chardet_result['encoding']
    sys.stderr.write('Encoding: %s with confidence %.2f\n\n' % (
        encoding, chardet_result['confidence']))

    if need_transform_lines and encoding is None:
        # decode(None) would raise an unhelpful TypeError; fail clearly.
        sys.stderr.write(
            'Could not detect an encoding for the non-ASCII lines\n')
        return 1

    decoded = decode_lines(all_lines, need_transform_lines, encoding)
    sys.stdout.write('\n'.join(decoded))
    if decoded and text.endswith('\n'):
        # Only terminate the last line if the input did.
        sys.stdout.write('\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())