大人のおもちゃ箱: 2月 2020

2020年2月9日日曜日
SKKの辞書をCSVに変換してみた

SKKの辞書をCSVファイルに変換するためのPythonスクリプトを作ってみました．

ここにある辞書をダウンロードして，Shift-JISのCSVファイルとして出力しました．CSVのライブラリーは使っていません．
#!python3
# 2020/2/9
# 実行方法: ./SKKtoCSV.py SKK-JISYO.file
# 入力ファイル名の末尾に.csvをつけたファイル名にSJISで出力

# SKK-JISYOをCSVに変換
# EUC-JP -> SJIS
# 先頭;がコメント行
# 先頭>は語尾
# 先頭#は数字の置き換え
# よみ /漢字1;コメント,コメント/漢字2/...
# よみの最後が>もしくはローマ字の場合がある

import sys,os,re
import codecs

# Compiled once at import time.  Raw strings are required: in a normal string
# literal '\S' is an invalid escape sequence that newer Python versions warn
# about.
# Whole entry line: "<reading> /<candidate>/<candidate>/.../"
_RE_SKK_LINE = re.compile(r'^(\S+) /(.+/)$')
# A reading whose last character is an ASCII lowercase letter or '>'.
_RE_KANA_TAIL = re.compile(r'^(.+)([a-z>])$')


def _csv_quote( field ):
    """Return field wrapped in double quotes, with embedded quotes doubled."""
    return '"{}"'.format(field.replace('"', '""'))


def processLine( line, fout ):
    """Convert one SKK dictionary line into CSV rows written to fout.

    A usable line has the form ``reading /cand1;annotation/cand2/.../``.
    One row ``"reading","candidate",`` is written per candidate; everything
    from the first ';' of a candidate onwards is treated as an annotation
    and dropped.  Progress is traced on stdout.

    Nothing is written when the line
      * is empty,
      * starts with ';', '>' or '#',
      * does not match the entry pattern, or
      * has a reading ending in an ASCII lowercase letter or '>'.

    Args:
        line: One dictionary line; a trailing CR/LF is tolerated.
        fout: Object with a ``write(str)`` method that receives the rows.
    """
    # Lines taken from a CRLF file still carry '\r', which would make the
    # entry pattern fail for every line.
    line = line.rstrip('\r\n')

    print('[{}]'.format(line))
    sys.stdout.flush()
    if line == '':
        print('  -> ignore 1')
        return
    # First character -> number shown in the trace for skipped line kinds.
    skip_no = {';': 2, '>': 3, '#': 4}.get(line[0])
    if skip_no is not None:
        print('  -> ignore {}'.format(skip_no))
        return

    entry = _RE_SKK_LINE.match(line)
    if entry is None:
        print('  -> None')
        return

    # kana: the reading; kanjistr: "cand/cand/.../cand/"
    kana, kanjistr = entry.group(1), entry.group(2)
    print('  -> re result: {}, {}'.format(kana, kanjistr))
    sys.stdout.flush()

    print('  yomi: {}'.format(kana))
    sys.stdout.flush()
    # Readings ending in a lowercase ASCII letter or '>' are not converted.
    tail = _RE_KANA_TAIL.match(kana)
    if tail:
        print('  end [{}][{}]'.format(tail.group(1), tail.group(2)))
        print('  -> ignore 5')
        sys.stdout.flush()
        return

    # kanjistr always ends with '/', so the split yields a trailing ''.
    kanjilist = kanjistr.split('/')
    print('  kanjilist: {}'.format(kanjilist))
    if len(kanjilist) >= 10:
        print('  (many entry)')
    sys.stdout.flush()
    kanjilist = kanjilist[0:-1]
    print('  -> kanjilist: {}'.format(kanjilist))

    for kanji in kanjilist:
        print('    kanji: {}'.format(kanji))
        sys.stdout.flush()
        # Split at the FIRST ';' so "word;note;more" and "word;" both yield
        # "word".  A candidate with nothing before the ';' is kept whole.
        word, sep, comment = kanji.partition(';')
        if sep and word:
            print('  comment:{}[{}]'.format(word, comment))
        else:
            word = kanji
        print('  output: {},{}'.format(kana, word))
        # Embedded double quotes are doubled so the row stays valid CSV.
        fout.write('{},{},\n'.format(_csv_quote(kana), _csv_quote(word)))

def SKKtoCSV( fileNameIn ):
    """Convert the EUC-JP SKK dictionary fileNameIn into a Shift_JIS CSV.

    The whole input is read, split into lines, and each line is handed to
    processLine() together with the output file.  The output file name is
    ``fileNameIn + '.csv'``.  Both names are reported on stdout.

    Decoding and encoding use the default strict error handling, so a
    character that cannot be decoded or encoded raises and stops the run.

    Args:
        fileNameIn: Path of the SKK dictionary file to read.
    """
    print('processing {}'.format(fileNameIn))
    # Text mode with default newline handling turns CRLF into '\n', so
    # dictionaries with Windows line endings split into clean lines.
    with open(fileNameIn, 'r', encoding='euc_jp') as fin:
        linesAll = fin.read().split('\n')

    fileNameOut = fileNameIn + '.csv'
    print('output file name is {}'.format(fileNameOut))
    # newline='\n' writes line endings exactly as given (no platform
    # translation).  The with-block closes the file even if a line fails.
    with open(fileNameOut, 'w', encoding='shift_jis', newline='\n') as fout:
        for line in linesAll:
            processLine( line, fout )

##------
if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the dictionary file.
    args = sys.argv
    if len(args) != 2:
        # Wrong argument count: show the usage hint on stderr and exit with a
        # non-zero status so a calling shell or script can detect the failure.
        print('{} SKK-JISYO'.format(args[0]), file=sys.stderr)
        sys.exit(1)
    SKKtoCSV( args[1] )