CCL Home Page
Up Directory CCL koi8-tex
# koi8-plaintex.rus - KOI8 to PlainTeX conversion for translit
#
# version 1.0
#
# Created by Alexander L. Belikoff, 1997

# The TeX transliteration sequences follow the AMS Cyrillic convention for
# WNCYR fonts with the cyracc.def file.
# To be used with translit.c program by Jan Labanowski. For a format of
# this file consult translit documentation
# Process the KOI8-encoded file with translit. For example:
#   translit -i myfile.ko8 -o myfile.tex -t koi8-tex.rus
# Then process it with Plain TeX by using:
#   tex myfile.tex
# and then use your favorite dvi2something program. E.g., for PostScript use:
#   dvips myfile.dvi
#

   1            file version number

   "    "      # string delimiters (opening, closing)
   [    ]      # list delimiters (opening, closing)
   {    }      # regular expression delimiters (opening, closing)



# Starting sequence for TeX (a literal, multi-line string).
# It inputs cyracc.def, declares the font \tencyr as wncyr10, defines \cyr
# as \tencyr\cyracc (the shift-out sequence of output set 2 below opens a
# {\cyr ... } group), and switches on \obeylines and \obeyspaces.
"\input cyracc.def
\font\tencyr=wncyr10
\def\cyr{\tencyr\cyracc}
\obeylines
\obeyspaces
"

# Ending sequence (a literal, multi-line string): \bye on a line of its own.
"
\bye
"

   0     # number of input SHIFT sequences: none, there is only one set of input characters

   2     # number of output SHIFT sequences: two sets of output characters

# One line per output set:
# SHIFT-OUT      SHIFT-IN
    ""             ""            # set 1 (Latin): both shift sequences are empty
    "{\cyr "       "}"           # set 2 (Cyrillic): text is enclosed in {\cyr ... }

# conversion table
# inp_set     inp_seq           out_set    out_seq


# Characters which are not in ASCII (DEL, \0x7F, is included here as well) and
# are not among the KOI8 codes handled below are replaced by a star.
     0 [\0x7F-\0xA2\0xA4-\0xB2\0xB4-\0xBF]   0    "$\star$"

# dehyphenate words, e.g.  con-   (NL)cert  is changed to concert(NL)
# Below is a complicated (?) regular expression. It joins a hyphenated
# word. It looks for one or more letters (saves them as substring 1)
# followed by a hyphen (which may be followed by zero or more spaces
# or tabs). The hyphen must be followed by a NewLine (characters 0A-0D hex
# are various new line sequences) and saves NewLine sequence. Then it looks
# for zero or more tabs and spaces (at the beginning of the line). Then it
# looks for the rest of the hyphenated word and saves it as substring 3.
# The word may have punctuation attached. Then it looks again for some spaces
# or tabs. The substitute string junks all sequences which were not within (),
# i.e., hyphen and spaces/tabs and inserts only substrings but in a different
# order. The 1 (word beginning) is followed by 3 (word end) and followed by
# the NewLine. The {\2\1\3} would be equally good. The string is then returned
# back for processing (output code is -1).  Note that since input regular
# expression is very long, I chopped it into several lines by using \NL.
# If \ is followed by white space, the \ and all the white space which follows
# it are removed by the program. Be careful not to use "\white_space" in strings,
# lists or regular expressions. If you must, enter \ as a code (i.e., \0x5C).

# uncomment lines below if you want to dehyphenate

# 0 {([A-Za-z\0xA3\0xB3\0xC0-\0xFF]+)-[ \0x09]*([\0x0A-\0x0D]+)[ \0x09]*(\
#   [A-Za-z\0xA3\0xB3\0xC0-\0xFF,.?;:")'`!]+)[ \0x09]}    
#                                   -1      {\1\3\2}

# All Latin letters are converted to the same letters, but with the output
# set 1 (the Latin set, whose shift sequences are both empty).
     0      [A-Za-z]                1    [A-Za-z]    # Latin letters A-Z and a-z

# Mark blank lines with \medskip.
# The expression matches two \0x0A characters separated only by spaces/tabs.
# A run of \0x0B-\0x0D characters just before the first \0x0A is saved as
# substring 1, and such a run just after the second \0x0A as substring 2.
# The replacement is \medskip, substring 1, a single \0x0A and substring 2,
# so only one of the two matched \0x0A characters is kept.
# In the replacement, \\ stands for a literal backslash.
# NOTE(review): the output code is 0 rather than the -1 used by the
# dehyphenation example, so presumably the result is not handed back for
# further processing -- confirm against the translit documentation.
      0 {([\0x0B-\0x0D]*)\0x0A[ \0x09]*\0x0A([\0x0B-\0x0D]*)} 0 {\\medskip\1\0x0A\2}

# Quote some special TeX characters so that they come out as printable text

#  These are written with output set 0; they do not require going out of
#  {\cyr ....}. Most are wrapped in math mode ($...$), the last two are
#  backslash-escaped.
     0         "["                  0       "$[$"
     0         "]"                  0       "$]$"
     0         "^"                  0       "$\wedge$"
     0         "{"                  0       "$\lbrace$"
     0         "}"                  0       "$\rbrace$"
     0         "~"                  0       "$\sim$"
     0         "\"                  0       "$\backslash$"
     0         "|"                  0       "$\mid$"
     0         "*"                  0       "$\star$"   # same output as the catch-all for unknown characters
     0         "<"                  0       "$<$"
     0         ">"                  0       "$>$"
     0         "$"                  0       "\$" 
     0         "%"                  0       "\%"

#  These can be represented correctly only in the Latin charset, so they are
#  written with output set 1
     0         "_"                  1       "\_"
     0         "&"                  1       "\&"
     0         "#"                  1       "\#"
     0         "@"                  1       "@"
     
# Cyrillic letters (all written with output set 2, i.e. inside {\cyr ... })
#
# Two-character input sequences come first: T/t (\0xF4, \0xD4) followed by
# Shch/shch (\0xFD, \0xDD), Sh/sh (\0xFB, \0xDB) or S/s (\0xF3, \0xD3).
# {\cydot} is placed between the two outputs so that the adjacent letters
# T+S / t+s are not taken for C/c, the output used for \0xE3 / \0xC3 below.
# \cydot is not defined in this file; presumably it comes from cyracc.def.
# NOTE(review): other neighbouring letters whose outputs concatenate to a
# different entry's output (e.g. "sh" + "ch" vs. "shch", "y" + "a" vs. "ya")
# have no such guard here -- confirm whether that is intended.
     0         "\0xF4\0xFD"         2       "T{\cydot}Shch" # T + Shch: keep TS from reading as C
     0         "\0xF4\0xDD"         2       "T{\cydot}shch" # T + shch: keep Ts from reading as C
     0         "\0xD4\0xFD"         2       "t{\cydot}Shch" # t + Shch: keep tS from reading as C
     0         "\0xD4\0xDD"         2       "t{\cydot}shch" # t + shch: keep ts from reading as c

     0         "\0xF4\0xFB"         2       "T{\cydot}Sh"   # T + Sh: keep TS from reading as C
     0         "\0xF4\0xDB"         2       "T{\cydot}sh"   # T + sh: keep Ts from reading as C
     0         "\0xD4\0xFB"         2       "t{\cydot}Sh"   # t + Sh: keep tS from reading as C
     0         "\0xD4\0xDB"         2       "t{\cydot}sh"   # t + sh: keep ts from reading as c

     0         "\0xF4\0xF3"         2       "T{\cydot}S" # T + S: keep TS from reading as C
     0         "\0xF4\0xD3"         2       "T{\cydot}s" # T + s: keep Ts from reading as C
     0         "\0xD4\0xF3"         2       "t{\cydot}S" # t + S: keep tS from reading as c
     0         "\0xD4\0xD3"         2       "t{\cydot}s" # t + s: keep ts from reading as c

# In the next two outputs, \\ gives a backslash and \0o42 is the octal code of
# the double quote (presumably written as a code because the double quote is
# the string delimiter); the resulting TeX text is \"e and \"E.
     0         "\0xA3"              2       "\\0o42e"    # small \"e (yo)
     0         "\0xB3"              2       "\\0o42E"    # capital \"E (Yo)

# Capital letters
     0         "\0xE1"              2       "A"             
     0         "\0xE2"              2       "B"             
     0         "\0xF7"              2       "V"             
     0         "\0xE7"              2	    "G"             
     0         "\0xE4"              2       "D"             
     0         "\0xE5"              2       "E"             
     0         "\0xF6"              2       "Zh"            
     0         "\0xFA"              2       "Z"             
     0         "\0xE9"              2       "I"             
     0         "\0xEA"              2       "{\u I}"       # short I (I kratkoje)
     0         "\0xEB"              2       "K"             
     0         "\0xEC"              2       "L"             
     0         "\0xED"              2       "M"             
     0         "\0xEE"              2       "N"             
     0         "\0xEF"              2       "O"             
     0         "\0xF0"              2       "P"             
     0         "\0xF2"              2       "R"             
     0         "\0xF3"              2       "S"             
     0         "\0xF4"              2       "T"             
     0         "\0xF5"              2       "U"             
     0         "\0xE6"              2       "F"             
     0         "\0xE8"              2       "Kh"            
     0         "\0xE3"              2       "C"             
     0         "\0xFE"              2       "Ch"            
     0         "\0xFB"              2       "Sh"            
     0         "\0xFD"              2       "Shch"          
     0         "\0xFF"              2       "{\Cdprime}"   # hard sign (tverdyj znak)
     0         "\0xF9"              2       "Y"             
     0         "\0xF8"              2       "{\Cprime}"    # soft sign (myagkij znak)
     0         "\0xFC"              2       "\`E"           
     0         "\0xE0"              2       "Yu"            
     0         "\0xF1"              2       "Ya"            

# Small letters (same order as the capital letters above)
     0         "\0xC1"              2       "a"             
     0         "\0xC2"              2       "b"             
     0         "\0xD7"              2       "v"             
     0         "\0xC7"              2       "g"             
     0         "\0xC4"              2       "d"             
     0         "\0xC5"              2       "e"             
     0         "\0xD6"              2       "zh"            
     0         "\0xDA"              2       "z"             
     0         "\0xC9"              2       "i"             
     0         "\0xCA"              2       "{\u i}"       # short i (i kratkoje)
     0         "\0xCB"              2       "k"             
     0         "\0xCC"              2       "l"             
     0         "\0xCD"              2       "m"             
     0         "\0xCE"              2       "n"             
     0         "\0xCF"              2       "o"             
     0         "\0xD0"              2       "p"             
     0         "\0xD2"              2       "r"             
     0         "\0xD3"              2       "s"             
     0         "\0xD4"              2       "t"             
     0         "\0xD5"              2       "u"             
     0         "\0xC6"              2       "f"             
     0         "\0xC8"              2       "kh"            
     0         "\0xC3"              2       "c"             
     0         "\0xDE"              2       "ch"            
     0         "\0xDB"              2       "sh"            
     0         "\0xDD"              2       "shch"          
     0         "\0xDF"              2       "{\cdprime}"   # hard sign (tverdyj znak)
     0         "\0xD9"              2       "y"             
     0         "\0xD8"              2       "{\cprime}"    # soft sign (myagkij znak)
     0         "\0xDC"              2       "\`e"           
     0         "\0xC0"              2       "yu"            
     0         "\0xD1"              2       "ya"            
Modified: Wed Jan 22 17:00:00 1997 GMT
Page accessed 7374 times since Sat Apr 17 21:33:26 1999 GMT