%!PS-Adobe-2.0
%%Title: a.mss
%%DocumentFonts: (atend)
%%Creator: Frank da Cruz,718W,0000, and Scribe 7(1700)
%%CreationDate: 5 July 1999 20:57
%%Pages: (atend)
%%EndComments
% PostScript Prelude for Scribe.
%
% Layout note: a "%" comment extends to the end of its line, so every DSC
% comment and every remark in this prelude sits on a line of its own.  If a
% comment shares a line with code, the code after it is never executed.
%
% Stack effects below are written "operands NAME", topmost operand last.

% ---- Page bracketing -------------------------------------------------------

% BS: save VM state in SV, move the origin to (0, 792) and scale by
% (.01, -.01): one unit becomes 1/100 of the default unit and y grows downward.
/BS {/SV save def 0.0 792.0 translate .01 -.01 scale} bind def

% ES: emit the page, then restore the state saved by BS.
/ES {showpage SV restore} bind def

% r g b SC: set the current RGB colour.
/SC {setrgbcolor} bind def

% ---- Font selection --------------------------------------------------------
% Font state lives in three variables: WFT (font dictionary), SSZ (size,
% already multiplied by 100) and SLT (slant angle in degrees).  RDF rebuilds
% the current font from them.

/FMTX matrix def    % scratch matrix reused by RDF

% RDF: makefont WFT with [SSZ 0 s -SSZ 0 0] and make it current.  The negative
% y scale compensates for the flipped y axis set up by BS.  s is 0 when SLT is
% 0, otherwise sin(-SLT)/cos(SLT) * SSZ.
/RDF {
  WFT
  SLT 0.0 eq
    {SSZ 0.0 0.0 SSZ neg 0.0 0.0 FMTX astore}
    {SSZ 0.0 SLT neg sin SLT cos div SSZ mul SSZ neg 0.0 0.0 FMTX astore}
  ifelse
  makefont setfont
} bind def

/SLT 0.0 def

% angle SI: set the slant (converted to real) and rebuild the font.
/SI { /SLT exch cvr def RDF} bind def

/WFT /Courier findfont def

% /fontname SF: switch typeface, keeping size and slant.
/SF { /WFT exch findfont def RDF} bind def

/SSZ 1000.0 def

% size SS: set the size (stored as size * 100), keeping typeface and slant.
/SS { /SSZ exch 100.0 mul def RDF} bind def

% size /fontname AF: set typeface and size together.
/AF { /WFT exch findfont def /SSZ exch 100.0 mul def RDF} bind def

% ---- Positioning -----------------------------------------------------------

% x y MT: alias for moveto.
/MT /moveto load def

% x XM: move to x on the current line (the current y is kept).
/XM {currentpoint exch pop moveto} bind def

% ---- Rules and boxes -------------------------------------------------------

% length thickness x y UL: stroke a horizontal rule of the given length that
% starts at (x, y) shifted by thickness/2 in y.  Graphics state is preserved.
/UL {gsave newpath moveto dup 2.0 div 0.0 exch rmoveto setlinewidth 0.0 rlineto stroke grestore} bind def

% length thickness x y LH: stroke a horizontal line from (x, y).
% NOTE(review): LH, LV and BX execute gsave twice and grestore once, so they
% return with one gsave pending and the path still current.  BX1/BX2 each
% execute the matching grestore; confirm against callers that one of them
% always follows.
/LH {gsave newpath moveto setlinewidth 0.0 rlineto gsave stroke grestore} bind def

% length thickness x y LV: as LH, but the line runs in y.
/LV {gsave newpath moveto setlinewidth 0.0 exch rlineto gsave stroke grestore} bind def

% width height thickness x y BX: build a closed rectangle starting at (x, y)
% (sides +width, -height, -width, then closepath) and stroke its outline.
/BX {gsave newpath moveto setlinewidth exch dup 0.0 rlineto exch 0.0 exch neg rlineto neg 0.0 rlineto closepath gsave stroke grestore} bind def

% BX1: pop one saved graphics state.
/BX1 {grestore} bind def

% thickness BX2: stroke the current path with gray level 1 at the given line
% width, then pop one saved graphics state.
/BX2 {setlinewidth 1 setgray stroke grestore} bind def

% ---- Included pictures -----------------------------------------------------

% a x y PB: save VM state in PV, translate to (x, y), scale by (100, -100),
% discard a, and turn showpage into a no-op.
/PB {/PV save def newpath translate 100.0 -100.0 scale pop /showpage {} def} bind def

% PE: restore the state saved by PB.
/PE {PV restore} bind def

% a b angle x y GB: save VM state in PV, translate to (x, y), rotate by
% angle, scale uniformly by a/b, scale by (100, -100), and turn showpage into
% a no-op.
/GB {/PV save def newpath translate rotate div dup scale 100.0 -100.0 scale /showpage {} def} bind def

% GE: restore the state saved by GB.
/GE {PV restore} bind def

% ---- Font-name mapping -----------------------------------------------------

% n FB: create an n-entry dictionary, store it as FontMapDict and push it on
% the dictionary stack.
/FB {dict dup /FontMapDict exch def begin} bind def

% (from) (to) FM: define the name "from" as the name "to" in the current
% dictionary.
/FM {cvn exch cvn exch def} bind def

% FE: pop the dictionary stack, remember the current findfont as
% original-findfont, and redefine findfont so that a name present in
% FontMapDict is replaced by its mapped name before the lookup.
/FE {
  end
  /original-findfont /findfont load def
  /findfont {
    dup FontMapDict exch known {FontMapDict exch get} if
    original-findfont
  } def
} bind def

% ---- Clipping --------------------------------------------------------------

% width height x y BC: gsave, then clip to the rectangle with corners (x, y)
% and (x + width, y + height).
/BC {gsave moveto dup 0 exch rlineto exch 0 rlineto neg 0 exch rlineto closepath clip} bind def

% EC: alias for grestore (undoes BC).
/EC /grestore load def

% ---- Text ------------------------------------------------------------------

% (string) SH: alias for show.
/SH /show load def

% (string) dx MX: show the string, then move dx horizontally.
/MX {exch show 0.0 rmoveto} bind def

% (string) pad W: show the string, adding pad to the width of every space
% (character code 32).
/W {0 32 4 -1 roll widthshow} bind def

% (string) dx pad WX: as W, then move dx horizontally.
/WX {0 32 5 -1 roll widthshow 0.0 rmoveto} bind def

% ---- Page rotation ---------------------------------------------------------
% Each procedure first undoes the (.01, -.01) scaling with (100, -100),
% applies a translation and a 90-degree rotation, and then reapplies the
% (.01, -.01) scaling.  URC and URCC apply the inverse steps of RC and RCC in
% the opposite order.
/RC {100.0 -100.0 scale 612.0 0.0 translate -90.0 rotate .01 -.01 scale} bind def
/URC {100.0 -100.0 scale 90.0 rotate -612.0 0.0 translate .01 -.01 scale} bind def
/RCC {100.0 -100.0 scale 0.0 -792.0 translate 90.0 rotate .01 -.01 scale} bind def
/URCC {100.0 -100.0 scale -90.0 rotate 0.0 792.0 translate .01 -.01 scale} bind def

% Generate encodings for accent fonts. Taken from the Postscript "Cookbook"
/reencodedict 6 dict def

% /basefontname /newfontname encoding ReEncode: define newfontname as a copy
% of the base font that uses the given encoding array.  Every entry except
% FID and Encoding is copied; FontName and Encoding are then set explicitly.
/ReEncode {
  reencodedict begin
    /newencoding exch def
    /newfontname exch def
    /basefontname exch def
    /basefontdict basefontname findfont def
    /newfont basefontdict maxlength dict def
    basefontdict {
      % Stack here: key value.  "1 index" re-reads the key for the second
      % comparison; a plain "dup" would compare the boolean left by the first
      % comparison with /Encoding, which is always unequal.
      exch dup /FID ne 1 index /Encoding ne and
        { exch newfont 3 1 roll put }
        { pop pop }
      ifelse
    } forall
    newfont /FontName newfontname put
    newfont /Encoding newencoding put
    newfontname newfont definefont pop
  end
} def

% index /glyphname Stuff: store glyphname at Accent[index + 1] and leave
% index + 1 on the stack, so a forall over a name array fills consecutive
% slots starting one past the initial index.
/Stuff { exch 1 add dup Accent 4 2 roll exch put } def

% Accent: 256-entry encoding vector, initially all /.notdef.
/Accent 256 array def
0 1 255 { Accent exch /.notdef put } for

% Each run below is "start [names] { Stuff } forall pop"; the first name goes
% to code start + 1 and the trailing pop discards the final index.
8#040 [ /exclamdown /dotlessi /AE /OE /Oslash /Lslash /acute /guillemotleft
        /guillemotright /macron /cent /sterling /currency /yen /bar /section
        /dieresis /hungarumlaut] { Stuff } forall pop
8#062 [ /ae /oe /oslash /thorn /lslash /ordfeminine /degree /ordmasculine
        /paragraph /periodcentered /florin /ogonek] { Stuff } forall pop
8#076 [ /questiondown /caron] { Stuff } forall pop
8#100 [ /Aacute /Acircumflex /Adieresis /Agrave /Aring /Atilde /Ccedilla
        /Eacute /Ecircumflex /Edieresis /Egrave /Iacute /Icircumflex
        /Idieresis /Igrave /Ntilde /Oacute /Ocircumflex /Odieresis /Ograve
        /Otilde /Scaron /Uacute /Ucircumflex /Udieresis /Ugrave /Ydieresis
        /Zcaron /Eth /Thorn /germandbls /breve] { Stuff } forall pop
8#140 [ /aacute /acircumflex /adieresis /agrave /aring /atilde /ccedilla
        /eacute /ecircumflex /edieresis /egrave /iacute /icircumflex
        /idieresis /igrave /ntilde /oacute /ocircumflex /odieresis /ograve
        /otilde /scaron /uacute /ucircumflex /udieresis /ugrave /ydieresis
        /zcaron /eth /cedilla] { Stuff } forall pop
8#177 [ /a /b /c /d /e /f /g /h /i /j /k /l /m /n /o /p /q /r /s /t /u /v /w
        /x /y /z /A /B /C /D /E /F /G /H /I /J /K /L /M /N /O /P /Q /R /S /T
        /U /V /W /X /Y /Z /hungarumlaut /grave /breve /circumflex /tilde
        /macron /breve /dotaccent /dieresis /ring /cedilla /hungarumlaut
        /ogonek /caron] { Stuff } forall pop

% Define an "-Accent" variant of each text font using the Accent encoding.
/Times-Roman /Times-Accent Accent ReEncode
/Times-Bold /Times-Bold-Accent Accent ReEncode
/Times-Italic /Times-Italic-Accent Accent ReEncode
/Helvetica /Helvetica-Accent Accent ReEncode
/Helvetica-Bold /Helvetica-Bold-Accent Accent ReEncode
/Helvetica-Oblique /Helvetica-Italic-Accent Accent ReEncode
/Helvetica-Narrow /Helvetica-Narrow-Accent Accent ReEncode
/Helvetica-Narrow-Bold /Helvetica-Narrow-Bold-Accent Accent ReEncode
/Helvetica-Narrow-Oblique /Helvetica-Narrow-Italic-Accent Accent ReEncode
/AvantGarde-Book /Avantgarde-Accent Accent ReEncode
/AvantGarde-BookOblique /Avantgarde-Italic-Accent Accent ReEncode
/AvantGarde-Demi /Avantgarde-Bold-Accent Accent ReEncode
/Courier /Courier-Accent Accent ReEncode
/Courier-Bold /Courier-Bold-Accent Accent ReEncode
/Courier-Oblique /Courier-Italic-Accent Accent ReEncode
/Palatino-Roman /Palatino-Accent Accent ReEncode
/Palatino-Bold /Palatino-Bold-Accent Accent ReEncode
/Palatino-Italic /Palatino-Italic-Accent Accent ReEncode
/Bookman-Light /Bookman-Accent Accent ReEncode
/Bookman-Demi /Bookman-Bold-Accent Accent ReEncode
/Bookman-LightItalic /Bookman-Italic-Accent Accent ReEncode
/ZapfChancery-MediumItalic /Chancery-Accent Accent ReEncode
/NewCenturySchlbk-Roman /NewCenturySchlbk-Accent Accent ReEncode
/NewCenturySchlbk-Bold /NewCentury-Bold-Accent Accent ReEncode
/NewCenturySchlbk-Italic /NewCentury-Italic-Accent Accent ReEncode
/Garamond-Light /Garamond-Accent Accent ReEncode
/Garamond-Bold /Garamond-Bold-Accent Accent ReEncode
/Garamond-LightItalic /Garamond-Italic-Accent Accent ReEncode
%%EndProlog
%%Page: 1 1
BS 0 SI 15.5 /Times-Bold AF 16133 8239 MT
(Interchange of Non-English Computer Text)SH 10.5 /Times-Roman AF 27596 10524 MT (Frank da Cruz)SH 23716 12824 MT (Columbia University, New York)SH 29550 13974 MT (1994)SH 15.5 /Times-Bold AF 7200 18613 MT (Introduction)SH 10.5 /Times-Roman AF 7200 19910 MT (Thirty years ago,) 33 W( computers and people communicated using a small repertoire of symbols, digits, and Roman)32 W 7200 21207 MT (letters. Often) 375 W( the letters were only in uppercase, and there were no) 56 W( accents. The language of computing was)57 W 7200 22504 MT (exclusively English. Today, we are poised on the brink of) 189 W( a worldwide computer-based communications)188 W 7200 23801 MT (revolution, with a single character set encompassing all the world's writing systems.)SH 8850 25098 MT (In the intervening decades, we have contrived a vast) 100 W( Babel of mutually incomprehensible character sets,)101 W 7200 26395 MT (both proprietary and standard. The new Universal Character Set, ISO 10646) 34 W( [19],) SH( offers a) 34 W( single common en-)33 W 7200 27692 MT (coding for all writing systems. But radical and) 227 W( massive changes are required in data entry and display)228 W 7200 28989 MT (hardware as well as in computer software) 39 W( and data files at all levels from operating system to application, and)38 W 7200 30286 MT (could therefore take decades to see widespread use. In the meantime, how shall we survive our Babel?)SH 8850 31583 MT (The problem is immediately apparent to anyone) 13 W( who tries to transfer non-English textual data between two)14 W 7200 32880 MT (different kinds of computers using conventional methods. 
`)152 W 33430 XM (`P)SH /Times-Accent SF (b)SH /Times-Roman SF (t)SH /Times-Accent SF (h)SH /Times-Roman SF (')SH 35838 XM (' on an) 152 W( IBM PC becomes `)151 W 47905 XM (`P)SH /Times-Accent SF (H)SH /Times-Roman SF (t)SH /Times-Accent SF (c)SH /Times-Roman SF (')SH 50489 XM (' on the)151 W 7200 34177 MT (Macintosh, and a truckload of pocket-bread is delivered instead of goose liver.)SH 8850 35474 MT (This paper presents a simplified and condensed description of the character-set translation) 326 W( method)327 W 7200 36771 MT (developed for the Kermit file transfer protocol. The lessons learned should be useful in) 58 W( any arena where text)57 W 7200 38068 MT (must be transmitted meaningfully between unlike computer systems. Familiarity with the standards process in)24 W 7200 39365 MT (general, and the) 51 W( US ASCII) 50 W( [1],) SH( ISO 646) 50 W( [14],) SH( and ISO 8859) 50 W( [18]) SH( character set standards is assumed, and with)50 W 7200 40662 MT (ISO Standards 2022 [15] and 4873 [17], as well as with proprietary character sets such as IBM PC code pages.)SH 15.5 /Times-Bold AF 7200 43266 MT (Types of Character Sets)SH 10.5 /Times-Roman AF 7200 44563 MT (Today's coded character sets can be) 145 W( classified along several axes: standard versus proprietary, 7-bit versus)146 W 7200 45860 MT (8-bit, single-byte versus multibyte, and so on. 
This paper treats only the) 111 W( character sets used in application-)110 W 7200 47157 MT (independent plain-text) 200 W( files, and not the application-specific text representations used in word processing,)201 W 7200 48454 MT (publishing, and) 70 W( similar environments that are concerned with rendering forms such as fonts, style, point size,)69 W 7200 49751 MT (ligatures, and so) 34 W( forth, and which sometimes offer character repertoires different from any plain-text character)35 W 7200 51048 MT (set.)SH 12.5 /Times-Bold AF 7200 53187 MT (Standard Character Sets)SH 10.5 /Times-Roman AF 7200 54484 MT (Let us define a standard character) 63 W( set as one that is registered in the ISO)62 W /Times-Italic SF 38670 XM (Register of Coded Characters to Be)62 W 7200 55781 MT (Used with Escape Sequences)139 W /Times-Roman SF 20039 XM ([21] under the provisions of ISO Standard 2375) 139 W( [16].) SH( The) 541 W( Register, which) 139 W( is)140 W 7200 57078 MT (maintained by the European Computer Manufacturers Association \050ECMA\051, includes listings of all) 38 W( ISO-regis-)37 W 7200 58375 MT (tered character sets and assigns unique registration numbers and designating escape sequences to each one.)SH 8850 59672 MT (Standard character sets are subdivided into two major types: graphic and control. Thus ASCII, which) 91 W( is)92 W 7200 60969 MT (the USA version of ISO 646, and which most people think of) 72 W( as a single character set, is really two sets: the)71 W 7200 62266 MT (ISO 646) 49 W( 32-character control set \050ISO registration number 001\051 and a 94-character graphics set \050ISO registra-)50 W 7200 63563 MT (tion 006\051. 
And, as specified by ISO Standard 4873, the characters) 29 W( Space and Delete are not part of ASCII)28 W /Times-Italic SF 52601 XM (per)SH 7200 64860 MT (se)SH /Times-Roman SF (, but rather separate components that must always be available in the presence of a 94-character graphics set.)SH 8850 66157 MT (Similarly, ISO 8859-1 Latin Alphabet 1, which most of us think of as a coherent 8-bit) 67 W( character set is, in)68 W 7200 67454 MT (truth, composed of the ISO 646 control set \050registration 001\051, the ISO 646 USA graphics) 22 W( set \050006\051, the charac-)21 W 7200 68751 MT (ters Space) 265 W( and Delete, a second 32-character control set \050normally, but not necessarily, ISO 6429) 266 W( [20],)SH 7200 70048 MT (registration 077\051, and a) 128 W( 96-character set known as `)127 W 29671 XM (`The Right-hand Part of Latin Alphabet 1')127 W 48127 XM (' \050registration)127 W 7200 71345 MT (100\051. Each) 437 W( of these pieces except Space and Delete has its own unique registration number and designating)87 W 7200 72642 MT (escape sequence. 
There is no single, unique identifier for the 8-bit Latin-1 character set in its entirety.)SH
ES
% A "%" comment runs to the end of its line, so the DSC page marker gets a
% line of its own; sharing a line with it would comment out the BS call and
% every operator after it.
%%Page: 2 2
BS 0 SI
10 /Times-Roman AF 30350 4286 MT (2)SH
46800 50 7200 8017 UL
% Table drawn in 9-unit Courier, one MT per row; captioned "Figure 1" below.
9 /Courier AF
8820 9713 MT (<--C0--> <---------GL----------><--C1--> <---------GR---------->)SH
9900 10661 MT (00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15)540 W
8820 11609 MT (+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+)SH
7200 12557 MT (00 |) SH( |) 3240 W( SP) SH( |) 10260 W( |) 3240 W( _ |) SH( |)9720 W
7200 13505 MT (01 |) SH( |) 3240 W( |) 11880 W( |) 3240 W( |)11880 W
7200 14453 MT (02 |) SH( |) 3240 W( |) 11880 W( |) 3240 W( |)11880 W
7200 15401 MT (03 |) SH( C |) 1080 W( ASCII) 3240 W( |) 5400 W( C |) 1080 W( Special |)3780 W
7200 16349 MT (04 |) SH( o |) 1080 W( graphics) 3240 W( |) 3780 W( o |) 1080 W( Graphics) 3780 W( |)3240 W
7200 17297 MT (05 |) SH( n |) 1080 W( |) 11880 W( n |) 1080 W( |)11880 W
7200 18245 MT (06 |) SH( t |) 1080 W( |) 11880 W( t |) 1080 W( |)11880 W
7200 19193 MT (07 |) SH( r |) 1080 W( |) 11880 W( r |) 1080 W( |)11880 W
7200 20141 MT (08 |) SH( o |) 1080 W( |) 11880 W( o |) 1080 W( |)11880 W
7200 21089 MT (09 |) SH( l |) 1080 W( |) 11880 W( l |) 1080 W( |)11880 W
7200 22037 MT (10 |) SH( s |) 1080 W( |) 11880 W( s |) 1080 W( |)11880 W
7200 22985 MT (11 |) SH( |) 3240 W( |) 11880 W( |) 3240 W( |)11880 W
7200 23933 MT (12 |) SH( |) 3240 W( |) 11880 W( |) 3240 W( |)11880 W
7200 24881 MT (13 |) SH( |) 3240 W( |) 11880 W( |) 3240 W( |)11880 W
7200 25829 MT (14 |) SH( |) 3240 W( |) 11880 W( |) 3240 W( |)11880 W
7200 26777 MT (15 |) SH( |) 3240 W( DEL|) 10260 W( |) 3240 W( ,) 9720 W( - |)SH
8820 27725 MT (+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+)SH
8820 28673 MT (<--C0--> <---------GL----------><--C1--> <---------GR---------->)SH
46800 50 7200 29887 UL
9.5 /Times-Bold AF 7200 31337 MT (Figure 1:)SH /Times-Roman SF 11396 XM (Structure of a Standard 8-Bit Single-Byte Character Set)SH
10.5 SS 8850 35057 MT (In 
practice, most standard character sets have the same structure, which is illustrated in Figure 1:)SH 7200 37074 MT (1.)SH 8796 XM (The 32-character control set of ISO 646 in columns 0 and 1. This is called the C0 region.)SH 7200 39091 MT (2.)SH 8796 XM (The character Space at position 2/0.)SH 7200 41108 MT (3.)SH 8796 XM (A 94-character graphics set in positions 2/1 through 7/14. This is called the Graphics Left, or GL, region.)SH 8796 42405 MT (For 7-bit character sets, these are the 84 characters of the ISO 646 International Reference Version plus 10)SH 8796 43702 MT (country-specific characters. For 8-bit sets, these are the 94 graphic characters of US ASCII.)SH 7200 45719 MT (4.)SH 8796 XM (The character Delete at position 7/15.)SH 7200 47736 MT (5.)SH 8796 XM (For 8-bit sets, a 32-character `)SH 21354 XM (`right half')SH 25834 XM (' control set in columns 8 and 9. This is the C1 region.)SH 7200 49753 MT (6.)SH 8796 XM (For 8-bit sets, a 94- or 96-character `)SH 24155 XM (`right half')SH 28635 XM (' graphics set in columns 10 through 15. This is the)SH 8796 51050 MT (Graphics Right or GR region.)SH 7200 53067 MT (7.)SH 8796 XM (In multibyte graphic sets \050not illustrated in the figure\051, each character is composed of a fixed number of)SH 8796 54364 MT (bytes, generally two, with each byte in the graphic range, i.e. not corresponding to a C0 or C1 control)SH 8796 55661 MT (character, and following either the 94- or 96-character structure. Thus, single-byte control characters can)SH 8796 56958 MT (be mixed with double-byte graphic characters with no ambiguity.)SH 7200 58975 MT (The rationale for reserving an area for control characters in the right half) 12 W( of an 8-bit set is that communications)11 W 7200 60272 MT (devices tend to examine only the low-order 7 bits of a character when deciding whether it is a) 162 W( control or)163 W 7200 61569 MT (graphic character. 
Placing graphic characters in columns 8 and 9 often triggers unwanted control functions.)SH
8850 62866 MT (During the early) 275 W( decades of computing, when 7-bit communication was the rule, ISO 646 was the)274 W
7200 64163 MT (predominant method for) 53 W( representing the special characters of each language, and each European country had)54 W
7200 65460 MT (its own version of ISO 646. But this) 51 W( imposed severe limitations on the user. For example, ISO 646 makes it)50 W
7200 66757 MT (impossible to) 65 W( mix \050say\051 German text and C-language programming syntax in the same file. The following C)66 W
7200 68054 MT (program fragment:)SH
9.5 /Courier AF
7200 69645 MT (if \050~\050a[i] | x\051\051 {)SH
9480 70635 MT (printf\050"Gr)SH /Courier-Accent SF (y_)SH /Courier SF (e aus K)SH /Courier-Accent SF (s)SH /Courier SF (ln\134n"\051;)SH
7200 71625 MT (})SH
ES
% A "%" comment runs to the end of its line, so the DSC page marker gets a
% line of its own; sharing a line with it would comment out the BS call and
% every operator after it.
%%Page: 3 3
BS 0 SI
10 /Times-Roman AF 30350 4286 MT (3)SH
10.5 SS
7200 7920 MT (can not be encoded in either ASCII or in German ISO) 43 W( 646, because neither set has all the required characters.)42 W
7200 9217 MT (If German ISO 646 is used, German special characters are substituted for the braces, brackets, and bars:)SH
9.5 /Courier AF
7200 10808 MT (if \050)SH /Courier-Accent SF (_)SH /Courier SF (\050a)SH /Courier-Accent SF (C)SH /Courier SF (i)SH /Courier-Accent SF (Y) 570 MX(s)SH /Courier SF 14610 XM (x\051\051)SH /Courier-Accent SF 16890 XM (c)SH
/Courier SF 9480 11798 MT (printf\050"Gr)SH /Courier-Accent SF (y_)SH /Courier SF (e aus K)SH /Courier-Accent SF (s)SH /Courier SF (ln)SH /Courier-Accent SF (S)SH /Courier SF (n"\051;)SH
/Courier-Accent SF 7200 12788 MT (y)SH
10.5 /Times-Roman AF
7200 14506 MT (whereas if ASCII is chosen, we see the opposite effect:)SH
9.5 /Courier AF
7200 16097 MT (if \050~\050a[i] | x\051\051 {)SH
9480 17087 MT (printf\050"Gr}~e aus K|ln\134n"\051;)SH
7200 18077 MT (})SH
10.5 /Times-Roman AF
7200 19795 MT (Neither result is) 77 W( satisfactory, and matters only 
deteriorate when German-language program commentary is to)78 W 7200 21092 MT (be added.)SH 8850 22389 MT (To alleviate these problems, many sites are switching to the ISO 8859) 38 W( Latin alphabets. However, the ISO)37 W 7200 23686 MT (646 versions are still widely used, especially in electronic mail, a predominantly 7-bit medium.)SH 12.5 /Times-Bold AF 7200 25825 MT (Proprietary Character Sets)SH 10.5 /Times-Roman AF 7200 27122 MT (Most computers use either US ASCII) 117 W( [1]) SH( or IBM EBCDIC) 117 W( [13]) SH( as their basic character set. But to support)118 W 7200 28419 MT (entry, display, printing, and processing of textual data in) 139 W( languages other than English, computer manufac-)138 W 7200 29716 MT (turers soon recognized the need to extend these basic sets to allow representation of the) 180 W( accented Roman)181 W 7200 31013 MT (letters, non-Roman letters, ideograms, and other symbols used by the world's writing systems.)SH 8850 32310 MT (Some manufacturers provide ISO 646 national versions, but users) 10 W( suffer with their limitations. Besides the)9 W 7200 33607 MT (sacrifice of characters needed for programming,) 15 W( ISO 646 does not allow mixture of text in different languages,)16 W 7200 34904 MT (such as Italian, French, and Norwegian, in the same) 46 W( file. So manufacturers such as Digital and IBM began to)45 W 7200 36201 MT (devise 8-bit international character sets for the European languages, as well as other sets for) 76 W( languages using)77 W 7200 37498 MT (other writing) 2 W( systems. Most of these 8-bit sets are capable of representing text in several languages, allowing a)1 W 7200 38795 MT (single product to serve, and work compatibly, over a broader market, for example all of Western Europe.)SH 8850 40092 MT (Prominent among the) 123 W( proprietary sets are IBM's PC code pages) 124 W( [13].) 
SH( They) 511 W( resemble an ISO Latin Al-)124 W 7200 41389 MT (phabet by having ASCII in the left half, but depart from from the standard structure by using all of) 40 W( columns 8)39 W 7200 42686 MT (through 15, including the C1 area,) 80 W( for graphic characters. Thus IBM PC code pages have \050at least\051 32 more)81 W 7200 43983 MT (graphic characters than a standard 8-bit character set. Major manufacturers including Apple and) 25 W( NeXT follow)24 W 7200 45280 MT (the IBM design,) 45 W( but with different character repertoires and encoding. Others, notably Digital, Hewlett Pack-)46 W 7200 46577 MT (ard, and Data General, observe the standard structure, usually with different repertoires and encoding.)SH 8850 47874 MT (To add to the confusion, we also have IBM's many) 59 W( EBCDIC-based code pages, whose structure does not)58 W 7200 49171 MT (follow any national) 90 W( or international standard, as well as variations on IBM-like mainframes manufactured in)91 W 7200 50468 MT (Eastern Europe and the Far East, plus unknown numbers of proprietary sets from other manufacturers.)SH 12.5 /Times-Bold AF 7200 52607 MT (The Current Situation)SH 10.5 /Times-Roman AF 7200 53904 MT (Today we are confronted with hundreds of different coded character sets, both standard and proprietary. 
These)5 W
7200 55201 MT (sets differ in important ways:)SH
/Symbol SF 7200 57338 MT (\267)SH /Times-Italic SF 8796 XM (Size:)SH /Times-Roman SF 11100 XM (The total code space, 7-bit or 8-bit, single byte or multibyte.)SH
/Symbol SF 7200 59475 MT (\267)SH /Times-Italic SF 8796 XM (Structure:)SH /Times-Roman SF 13316 XM (Standard or nonstandard allocation of control and graphics areas.)SH
/Symbol SF 7200 61612 MT (\267)SH /Times-Italic SF 8796 XM (Repertoire:)SH /Times-Roman SF 13899 XM (The particular selection of characters.)SH
/Symbol SF 7200 63749 MT (\267)SH /Times-Italic SF 8796 XM (Encoding:)SH /Times-Roman SF 13434 XM (The particular code values assigned to each character.)SH
7200 65766 MT (Every application-independent plain-text file is encoded in a particular character set. It is) 83 W( generally not pos-)84 W
7200 67063 MT (sible to mix character sets within a plain-text file. Furthermore, a text) 64 W( file generally does not contain any in-)63 W
7200 68360 MT (dication of its) 52 W( character set. Neither, in general, does the host operating system identify a file's character set,)53 W
7200 69657 MT (nor indeed, provide any mechanism to do so.)SH
ES
% A "%" comment runs to the end of its line, so the DSC page marker gets a
% line of its own; sharing a line with it would comment out the BS call and
% every operator after it.
%%Page: 4 4
BS 0 SI
10 /Times-Roman AF 30350 4286 MT (4)SH
10.5 SS
8850 7920 MT (Within most applications and computing) 30 W( environments, a certain character set is simply)29 W /Times-Italic SF 46273 XM (assumed)SH /Times-Roman SF (. 
When) 321 W( a)29 W 7200 9217 MT (workstation supports multiple character sets, or when data must be communicated between unlike) 62 W( computers,)63 W 7200 10514 MT (there is no automatic mechanism for software applications to identify a file's character) 53 W( set, and hence no way)52 W 7200 11811 MT (to automatically display its characters) 68 W( correctly, nor to announce the character set to another computer or ap-)69 W 7200 13108 MT (plication during data transfer.)SH 8850 14405 MT (This problem has grown over the past) 49 W( decade as computers have become increasingly interconnected, and)48 W 7200 15702 MT (so are used increasingly for communication of text: news, conferencing, file) 299 W( transfer and sharing, and)300 W 7200 16999 MT (electronic mail.) 2 W( Not) 265 W( only are text character sets likely to be incompatible, but there are no universally accepted)1 W 7200 18296 MT (methods for translation.)SH 12.5 /Times-Bold AF 7200 20435 MT (Character Set Translation)SH 10.5 /Times-Roman AF 7200 21732 MT (Characters, such) 189 W( as the letter A, are represented in the computer, and in telecommunications, by numeric)190 W 7200 23029 MT (codes. Different) 309 W( computers use different codes for the same character. For example, the letter A is) 23 W( code 65 in)22 W 7200 24326 MT (ASCII, code 193 in EBCDIC, and code 9025 in JIS X 0208.)SH 8850 25623 MT (The most commonly used translation function is a simple array-indexing operation.) 23 W( Suppose) 310 W( we are trans-)24 W 7200 26920 MT (lating from character set)6 W /Times-Italic SF 17723 XM (A)SH /Times-Roman SF 18634 XM (to character set)6 W /Times-Italic SF 25271 XM (B)SH /Times-Roman SF (, and) 6 W( each set has 256 characters, and the characters in each set are)5 W 7200 28217 MT (represented by 8-bit code values in the range 0)8 W /Courier SF (..)SH /Times-Roman SF (255. 
The) 279 W( translation is accomplished) 8 W( by a linear array of 256)9 W 7200 29514 MT (8-bit elements called a translation table. The) 173 W( table element at position)172 W /Times-Italic SF 38993 XM (i)SH /Times-Roman SF 39720 XM (contains the translation from the)172 W 7200 30811 MT (character in set)88 W /Times-Italic SF 14083 XM (A)SH /Times-Roman SF 15076 XM (whose code is)88 W /Times-Italic SF 21493 XM (i)SH /Times-Roman SF 22136 XM (to the corresponding character in set)88 W /Times-Italic SF 38179 XM (B)SH /Times-Roman SF (, namely its code in set)88 W /Times-Italic SF 49239 XM (B)SH /Times-Roman SF (. For) 441 W( ex-)89 W 7200 32108 MT (ample, the 65th element of an ASCII-to-EBCDIC translation table would be the number 193.)SH 8850 33405 MT (During the translation process, a particular input character,)28 W /Times-Italic SF 34039 XM (c)SH /Times-Roman SF (_)SH /Times-Italic SF (in)SH /Times-Roman SF (, in set)27 W /Times-Italic SF 38963 XM (A)SH /Times-Roman SF 39895 XM (is translated to the output charac-)27 W 7200 34702 MT (ter,)SH /Times-Italic SF 8834 XM (c)SH /Times-Roman SF (_)SH /Times-Italic SF (out)SH /Times-Roman SF (, in set)SH /Times-Italic SF 14202 XM (B)SH /Times-Roman SF 15107 XM (by an array indexing operation as in this example, written in the C programming language:)SH 9.5 /Courier AF 7200 36293 MT (unsigned char a_to_b[256] = { ... }; /* Translation table */)SH 7200 37283 MT (unsigned char c_in, c_out;) SH( /*) 6270 W( Input and output characters */)SH 7200 38986 MT (c_out = a_to_b[c_in];) SH( /*) 9120 W( Translation function */)SH 10.5 /Times-Roman AF 7200 40704 MT (where the)16 W /Times-Italic SF 11606 XM (c)SH /Times-Roman SF (_)SH /Times-Italic SF (in)SH /Times-Roman SF 13693 XM (variable is used as a subscript to the)16 W /Times-Italic SF 29162 XM (a)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (b)SH /Times-Roman SF 32358 XM (array, and the) 16 W( notation)17 W /Courier SF 42137 XM ({ ... 
})17 W /Times-Roman SF 46861 XM (stands for the in-)17 W 7200 42001 MT (itialization of the translation table. In practice, the braces contain the quantities forming the table, in the) 77 W( ap-)76 W 7200 43298 MT (propriate order.)SH 8850 44595 MT (In constructing a translation function between any pair of character sets, there are three important and) 2 W( often)3 W 7200 45892 MT (conflicting goals:)SH 7200 47909 MT (1.)SH /Times-Italic SF 8796 XM (Invertibility)SH /Times-Roman SF 14017 XM (\050I\051: After translating text from)SH /Times-Italic SF 26940 XM (A)SH /Times-Roman SF 27845 XM (to)SH /Times-Italic SF 28925 XM (B)SH /Times-Roman SF (, and then back to)SH /Times-Italic SF 37268 XM (A)SH /Times-Roman SF (, is the result identical to the original?)SH 7200 49926 MT (2.)SH /Times-Italic SF 8796 XM (Readability)SH /Times-Roman SF 13901 XM (\050R\051: After translating text from)SH /Times-Italic SF 27174 XM (A)SH /Times-Roman SF 28079 XM (to)SH /Times-Italic SF 29159 XM (B)SH /Times-Roman SF (, is the result readable?)SH 7200 51943 MT (3.)SH /Times-Italic SF 8796 XM (Consistency)SH /Times-Roman SF 14132 XM (\050C\051: Are translations from)SH /Times-Italic SF 25333 XM (A)SH /Times-Roman SF 26238 XM (to)SH /Times-Italic SF 27318 XM (B)SH /Times-Roman SF 28223 XM (by different applications the same?)SH 7200 53960 MT (In attempting to) 104 W( achieve these goals, we must look at the size, structure, and repertoire of the two character)103 W 7200 55257 MT (sets. In) 285 W( many) 11 W( cases, the R-versus-I decision is forced upon us, but in others a choice is possible. For example,)12 W 7200 56554 MT (consider translating between two character sets of the same size: Latin-1 and Latin/Cyrillic. An invertible)149 W 7200 57851 MT (translation is possible that will not be readable, and a readable translation is) 51 W( possible that is not invertible. 
In)52 W 7200 59148 MT (cases like this, the best course is to let the user set the translation goal.)SH 12.5 /Times-Bold AF 7200 61287 MT (Invertibility)SH 10.5 /Times-Roman AF 7200 62584 MT (Invertibility is) 103 W( important in cases where the exact contents of the file is important, or when the goal of data)102 W 7200 63881 MT (transfer is not necessarily final usage. Suppose, for example, you compose) 50 W( a C-language program \050in ASCII\051)51 W 7200 65178 MT (on your PC, transfer) 67 W( it to an IBM mainframe \050where it is converted to EBCDIC\051, work on it some more, and)66 W 7200 66475 MT (then transfer it back to your PC. If the character-set) 39 W( translation from the PC to the mainframe and back is not)40 W 7200 67772 MT (invertible, you will likely not be able to compile the program again on your PC without syntax errors.)SH 8850 69069 MT (An invertible translation from character set)47 W /Times-Italic SF 27506 XM (A)SH /Times-Roman SF 28458 XM (to character set)47 W /Times-Italic SF 35218 XM (B)SH /Times-Roman SF 36170 XM (is possible only if)46 W /Times-Italic SF 44056 XM (A)SH /Times-Roman SF 45007 XM (is smaller than or the)46 W 7200 70366 MT (same size as)29 W /Times-Italic SF 12739 XM (B)SH /Times-Roman SF (. Similarly,) 322 W( an invertible translation from character set)30 W /Times-Italic SF 36957 XM (B)SH /Times-Roman SF 37892 XM (to character set)30 W /Times-Italic SF 44601 XM (A)SH /Times-Roman SF 45536 XM (is possible only if)30 W /Times-Italic SF 53358 XM (B)SH /Times-Roman SF 7200 71663 MT (is smaller than or the same size) 37 W( as)36 W /Times-Italic SF 22018 XM (A)SH /Times-Roman SF (. So) 335 W( it follows that invertibility can be achieved in both directions only if)36 W 7200 72960 MT (the two character sets are the same size. 
The following discussion applies only to bidirectional invertibility.)SH
ES
% A "%" comment runs to the end of its line, so the DSC page marker gets a
% line of its own; sharing a line with it would comment out the BS call and
% every operator after it.
%%Page: 5 5
BS 0 SI
10 /Times-Roman AF 30350 4286 MT (5)SH
10.5 SS
8850 8040 MT (The intersection of the two) 53 W( character sets)54 W /Times-Italic SF 26751 XM (A)SH /Times-Roman SF 27710 XM (and)SH /Times-Italic SF 29543 XM (B)SH /Times-Roman SF (, written)54 W /Times-Italic SF 34057 XM (A)SH /Symbol SF 34849 XM (\307)SH /Times-Italic SF 35955 XM (B)SH /Times-Roman SF (, is the set of characters,)54 W /Times-Italic SF 47304 XM (c)SH /Times-Roman SF (, that both sets)54 W
7200 9337 MT (have in common, that is, all the characters are members of \050)SH /Symbol SF 32612 XM (\316)SH /Times-Roman SF 33661 XM (\051 both)SH /Times-Italic SF 36404 XM (A)SH /Times-Roman SF 37309 XM (and)SH /Times-Italic SF 39088 XM (B)SH /Times-Roman SF (:)SH
/Times-Italic SF 9300 11042 MT (A)SH /Symbol SF 10092 XM (\307)SH /Times-Italic SF 11198 XM (B)SH /Times-Roman SF 12140 XM (= {)37 W /Times-Italic SF 13686 XM (c)SH /Times-Roman SF 14752 XM (:)SH /Times-Italic SF 15644 XM (c)SH /Symbol SF 16410 XM (\316)SH /Times-Italic SF 17459 XM (A)SH /Times-Roman SF (,)SH /Times-Italic SF 18814 XM (c)SH /Symbol SF 19580 XM (\316)SH /Times-Italic SF 20629 XM (B)SH /Times-Roman SF 21421 XM (})SH
7200 12627 MT (The characters in)SH /Times-Italic SF 14694 XM (A)SH /Times-Roman SF 15599 XM (that are)SH /Times-Italic SF 18982 XM (not)SH /Times-Roman SF 20587 XM (in)SH /Times-Italic SF 21667 XM (B)SH /Times-Roman SF 22572 XM (can be written as:)SH
/Times-Italic SF 9300 14332 MT (A)SH /Times-Roman SF 10092 XM (\134)SH /Times-Italic SF 10684 XM (B)SH /Times-Roman SF 11626 XM (= {)37 W /Times-Italic SF 13172 XM (c)SH /Times-Roman SF 14238 XM (:)SH /Times-Italic SF 15130 XM (c)SH /Symbol SF 15896 XM (\316)SH /Times-Italic SF 16945 XM (A)SH /Times-Roman SF (,)SH /Times-Italic SF 18450 XM (c)SH /Symbol SF 19216 XM (\317)SH /Times-Italic SF 20265 XM (B)SH /Times-Roman SF 21057 XM (})SH
7200 15917 MT (and the characters in)SH
/Times-Italic SF 16123 XM (B)SH /Times-Roman SF 17028 XM (that are not in)SH /Times-Italic SF 23096 XM (A)SH /Times-Roman SF 24001 XM (are:)SH /Times-Italic SF 9300 17622 MT (B)SH /Times-Roman SF 10092 XM (\134)SH /Times-Italic SF 10684 XM (A)SH /Times-Roman SF 11626 XM (= {)37 W /Times-Italic SF 13172 XM (c)SH /Times-Roman SF 14238 XM (:)SH /Times-Italic SF 15130 XM (c)SH /Symbol SF 15896 XM (\316)SH /Times-Italic SF 16945 XM (B)SH /Times-Roman SF (,)SH /Times-Italic SF 18450 XM (c)SH /Symbol SF 19216 XM (\317)SH /Times-Italic SF 20265 XM (A)SH /Times-Roman SF 21057 XM (})SH 8850 19327 MT (To make an) 76 W( invertible translation table, the characters of)75 W /Times-Italic SF 33501 XM (A)SH /Symbol SF 34293 XM (\307)SH /Times-Italic SF 35399 XM (B)SH /Times-Roman SF 36379 XM (are paired together: the letter `)75 W 49399 XM (`E')SH 50641 XM (' in one)75 W 7200 20624 MT (set to `)15 W 9989 XM (`E')SH 11231 XM (' in the other, `)15 W 17464 XM (`)SH /Times-Accent SF (K)SH /Times-Roman SF (')SH 18706 XM (' in one) 15 W( set to `)16 W 25015 XM (`)SH /Times-Accent SF (K)SH /Times-Roman SF (')SH 26257 XM (' in the other, and so on. The characters in)16 W /Times-Italic SF 44664 XM (A)SH /Times-Roman SF 45456 XM (\134)SH /Times-Italic SF 46048 XM (B)SH /Times-Roman SF 46969 XM (are paired 1-to-1)16 W 7200 21921 MT (with the characters in)19 W /Times-Italic SF 16550 XM (B)SH /Times-Roman SF 17342 XM (\134)SH /Times-Italic SF 17934 XM (A)SH /Times-Roman SF 18858 XM (according to)19 W /Times-Italic SF 24379 XM (some criterion:)19 W /Times-Roman SF 31387 XM (readability, consistency, whimsy, or caprice. 
The ex-)18 W 7200 23218 MT (act method for pairing the leftovers is problematic, and frequently a particular pair makes no) 72 W( sense at all, for)73 W 7200 24515 MT (example `)SH 11270 XM (`L-with-stroke')SH 17645 XM (' with `)SH 20638 XM (`Vulgar fraction 3/4')SH 29288 XM ('.)SH 8850 25812 MT (Any 1-to-1 pairing will give an invertible translation, but to achieve the) 31 W( most useful translation it is neces-)30 W 7200 27109 MT (sary to examine all the character sets) 36 W( involved. To illustrate, Latin Alphabet 1 lacks the OE digraph character)37 W 7200 28406 MT (but this character is found in the Digital Multinational character set, the Apple Quickdraw set, the Hewlett)136 W 7200 29703 MT (Packard Roman8 set, the Data General) 152 W( International set, and the NeXT character set, but at different code)153 W 7200 31000 MT (points in each. Ideally, the translations for each of these character sets would map OE digraph into) 64 W( the same)63 W 7200 32297 MT (Latin-1 code point, so that text translated from \050say\051 NeXT to) 41 W( Latin-1 and thence to Data General would keep)42 W 7200 33594 MT (its OE intact. But this would require an unprecedented degree of cooperation among competing manufac-)149 W 7200 34891 MT (turers.)SH 8850 36188 MT (The construction) 37 W( of invertible translations between private and standard character sets is beyond the scope)38 W 7200 37485 MT (of the national and international standards organizations, nor) 83 W( should these translations be made arbitrarily by)82 W 7200 38782 MT (programmers. Translation) 311 W( tables \050or algorithms\051 are most appropriately) 24 W( furnished by the creators or owners of)25 W 7200 40079 MT (each private character) 62 W( set. This lends the appropriate `)61 W 30631 XM (`official')SH 34264 XM (' air and allows all software developers to use)61 W 7200 41376 MT (the same translations, thus promoting interoperability of diverse applications. 
In) 6 W( 1990, IBM became one of the)7 W 7200 42673 MT (few computer manufacturers to take this step when it published invertible tables between ISO 8859-1 and its)74 W 7200 43970 MT (code pages 500 and 850 in its Character Data Representation Architecture Registry [13].)SH 8850 45267 MT (Other translations, however, are lacking from IBM, for example between its Cyrillic) 101 W( code pages and the)102 W 7200 46564 MT (ISO Latin/Cyrillic alphabet; similarly for) 23 W( Hebrew, Arabic, Greek, and so on. Official invertible translations of)22 W 7200 47861 MT (any kind seem to be entirely lacking from most other computer and software makers.)SH 12.5 /Times-Bold AF 7200 50000 MT (A Simple Rule)SH 10.5 /Times-Roman AF 7200 51297 MT (In the absence of an official translation,) 203 W( a simple procedure can be used to produce consistent invertible)204 W 7200 52594 MT (translations across all applications. Let us assume that)39 W /Times-Italic SF 30767 XM (P)SH /Times-Roman SF 31711 XM (is a private nonstandard character set, and that)39 W /Times-Italic SF 51707 XM (S)SH /Times-Roman SF 52533 XM (is a)38 W 7200 53891 MT (standard character set, and that)95 W /Times-Italic SF 20914 XM (P)SH /Times-Roman SF 21914 XM (and)SH /Times-Italic SF 23789 XM (S)SH /Times-Roman SF 24673 XM (are the same size,)96 W /Times-Italic SF 32726 XM (n)SH /Times-Roman SF (. 
Follow) 455 W( these steps to construct the translation)96 W 7200 55188 MT (table from)SH /Times-Italic SF 11809 XM (P)SH /Times-Roman SF 12714 XM (to)SH /Times-Italic SF 13794 XM (S)SH /Times-Roman SF 14582 XM (as an array,)SH /Times-Italic SF 19656 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF (, of)SH /Times-Italic SF 24120 XM (n)SH /Times-Roman SF 24908 XM (elements:)SH 7200 57325 MT (1.)SH 8796 XM (The characters that are common to)SH /Times-Italic SF 23611 XM (P)SH /Times-Roman SF 24516 XM (and)SH /Times-Italic SF 26295 XM (S)SH /Times-Roman SF (,)SH /Times-Italic SF 27346 XM (P)SH /Symbol SF 28138 XM (\307)SH /Times-Italic SF 29244 XM (S)SH /Times-Roman SF (, are mapped together. For each such character in)SH /Times-Italic SF 50942 XM (P)SH /Times-Roman SF (,)SH 8796 58622 MT (whose code value is)SH /Times-Italic SF 17486 XM (i)SH /Times-Roman SF (,)SH /Times-Italic SF 18304 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([)SH /Times-Italic SF (i)SH /Times-Roman SF (] takes the corresponding character's code value from)SH /Times-Italic SF 44550 XM (S)SH /Times-Roman SF (.)SH 7200 60639 MT (2.)SH 8796 XM (The members of)SH /Times-Italic SF 15942 XM (A)SH /Times-Roman SF 16734 XM (\134)SH /Times-Italic SF 17326 XM (B)SH /Times-Roman SF 18231 XM (and)SH /Times-Italic SF 20010 XM (B)SH /Times-Roman SF 20802 XM (\134)SH /Times-Italic SF 21394 XM (A)SH /Times-Roman SF 22299 XM (are paired with each other in)SH /Times-Italic SF 34548 XM (code order)SH /Times-Roman SF (.)SH 7200 62656 MT (This procedure guarantees that)SH /Times-Italic SF 20381 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF 23444 XM (has exactly)SH /Times-Italic SF 28401 XM (n)SH /Times-Roman SF 29189 XM (unique elements.)SH 8850 63953 MT (Step \0501\051 is not always easy.) 
17 W( All) 295 W( too frequently, private character sets are documented only by tables show-)16 W 7200 65250 MT (ing the graphic characters, often unclearly \050as when working from a fax of third-generation) 73 W( photocopy\051, with)74 W 7200 66547 MT (no names or other identifiers assigned to the characters.) 60 W( Even) 381 W( when the material is legible and names are as-)59 W 7200 67844 MT (signed, conventions) 135 W( for graphic representation differ, and so do the names. Thus, some knowledge of lan-)136 W 7200 69141 MT (guages, writing systems, world and corporate cultures, history, and politics is helpful.)SH 8850 70438 MT (Let's say that character set)34 W /Times-Italic SF 20480 XM (P)SH /Times-Roman SF 21419 XM (consists of four characters, the letters A, B, C, and) 34 W( D, whose code values are)33 W 7200 71735 MT (0, 1, 2, and 3, respectively. And set)29 W /Times-Italic SF 22806 XM (S)SH /Times-Roman SF 23623 XM (consists of the letters) 29 W( B, X, A, and Y, also with code values 0, 1, 2, and)30 W ES %%Page: 6 6 BS 0 SI 10 /Times-Roman AF 30350 4286 MT (6)SH 10.5 SS 7200 7920 MT (3, in that order.) 38 W( The) 338 W( letters A and B are common to both sets. 
A is represented by code 0 in)37 W /Times-Italic SF 47085 XM (P)SH /Times-Roman SF 48027 XM (and by code 2)37 W 7200 9217 MT (in)SH /Times-Italic SF 8280 XM (S)SH /Times-Roman SF (, so:)SH /Times-Italic SF 9300 10802 MT (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([0] = 2)37 W 7200 12387 MT (Similarly for the letter B:)SH /Times-Italic SF 9300 13972 MT (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([1] = 0)37 W 7200 15557 MT (Positions 2 and 3 of our translation array remain empty, so we assign them in code order:)SH /Times-Italic SF 9300 17142 MT (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([2] = 1)37 W /Times-Italic SF 9300 18292 MT (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([3] = 3)37 W 7200 19877 MT (and the translation from)SH /Times-Italic SF 17526 XM (P)SH /Times-Roman SF 18431 XM (to)SH /Times-Italic SF 19511 XM (S)SH /Times-Roman SF 20299 XM (is complete. 
Each element of the)SH /Times-Italic SF 34533 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF 37596 XM (array has a unique value.)SH 8850 21174 MT (To create the) 10 W( reverse translation table from)11 W /Times-Italic SF 27269 XM (S)SH /Times-Roman SF 28068 XM (to)SH /Times-Italic SF 29159 XM (P)SH /Times-Roman SF (,)SH /Times-Italic SF 30338 XM (s)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (p)SH /Times-Roman SF (, we could repeat the process in the reverse direc-)11 W 7200 22471 MT (tion, or, equivalently \050and) 27 W( more safely\051, simply turn the)26 W /Times-Italic SF 31008 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF 34097 XM (table `)26 W 36677 XM (`inside out')26 W 41416 XM (' by sorting it according to its)26 W 7200 23768 MT (values. Here) 263 W( is a C language program fragment that does the job:)SH 9.5 /Courier AF 7200 25359 MT (for \050i = 0; i < n; i++\051)SH 8340 26349 MT (s_to_p[p_to_s[i]] = i;)SH 10.5 /Times-Roman AF 8850 28067 MT (This leaves us with the two translation tables:)SH /Times-Italic SF 8775 29745 MT (Index p_to_s s_to_p)263 W 8.5 /Times-Roman AF 9650 30913 MT (0)SH 12975 XM (2)SH 16184 XM (1)SH 9650 31882 MT (1)SH 12975 XM (0)SH 16184 XM (2)SH 9650 32851 MT (2)SH 12975 XM (1)SH 16184 XM (0)SH 9650 33820 MT (3)SH 12975 XM (3)SH 16184 XM (3)SH 10.5 SS 8850 35500 MT (Naturally, such an arbitrary method will please few) 19 W( \050hence the foregoing plea for more-sensible)20 W /Times-Italic SF 49574 XM (official)SH /Times-Roman SF 52833 XM (in-)SH 7200 36797 MT (vertible translations\051; in this case C becomes X) 9 W( and vice versa. But)8 W /Times-Italic SF 35952 XM (vice-versa)SH /Times-Roman SF 40536 XM (is exactly what is needed for in-)8 W 7200 38094 MT (vertibility and consistency. 
Those characters the two sets have in common are translated) 13 W( readably, and the rest)14 W 7200 39391 MT (are translated according to the Simple Rule for)SH /Times-Italic SF 27092 XM (consistent)SH /Times-Roman SF 31554 XM (invertibility.)SH 8850 40688 MT (In a more useful) 203 W( application of the Simple Rule, let us construct an invertible mapping between two)202 W 7200 41985 MT (real-life, 8-bit, single-byte) 64 W( character sets, Data General International \050DGI\051) 65 W( [7]) SH( and ISO 8859-1) 65 W( [18],) SH( between)65 W 7200 43282 MT (which there is no official invertible mapping. We begin by finding all the characters) 20 W( from DGI that are also in)19 W 7200 44579 MT (Latin-1 \05080 of them\051) 128 W( and make the appropriate mappings. We are left with two lists of sixteen unmatched)129 W 7200 45876 MT (characters. Applying) 389 W( the Simple Rule, the lists are sorted) 63 W( in code order and placed side-by-side to obtain the)62 W 7200 47173 MT (following correspondence:)SH 9300 49043 MT (160 Undefined)997 W 30600 XM (160 No-break) 997 W( space)SH 9300 50193 MT (175 Double) 997 W( dagger)SH 30600 XM (166 Broken) 997 W( bar)SH 9300 51343 MT (179 Trade) 997 W( mark uncircled)SH 30600 XM (173 Soft) 997 W( hyphen)SH 9300 52493 MT (180 Florin) 997 W( sign)SH 30600 XM (175 Macron)997 W 9300 53643 MT (183 Less-than-or-equal) 997 W( sign)SH 30600 XM (184 Cedilla)997 W 9300 54793 MT (184 Greater-than-or-equal) 997 W( sign)SH 30600 XM (185 Superscript) 997 W( one)SH 9300 55943 MT (186 Grave) 997 W( accent)SH 30600 XM (188 Vulgar) 997 W( fraction one quarter)SH 9300 57093 MT (191 Up) 997 W( arrow)SH 30600 XM (190 Vulgar) 997 W( fraction three quarters)SH 9300 58243 MT (215 Capital) 997 W( OE digraph)SH 30600 XM (208 Capital) 997 W( Icelandic letter Eth)SH 9300 59393 MT (220 Undefined)997 W 30600 XM (215 Multiplication) 997 W( sign)SH 9300 60543 MT (221 Uppercase) 997 W( letter Y with diaeresis)SH 30600 XM (221 Capital) 997 W( letter Y with acute 
accent)SH 9300 61693 MT (222 Undefined)997 W 30600 XM (222 Capital) 997 W( Icelandic letter Thorn)SH 9300 62843 MT (223 Undefined)997 W 30600 XM (240 Small) 997 W( Icelandic letter eth)SH 9300 63993 MT (247 Small) 997 W( oe digraph)SH 30600 XM (247 Division) 997 W( sign)SH 9300 65143 MT (254 Undefined)997 W 30600 XM (253 Small) 997 W( letter y with acute accent)SH 9300 66293 MT (255 Fill) 997 W( character light)SH 30600 XM (254 Small) 997 W( Icelandic letter thorn)SH 7200 68163 MT (So if)48 W /Times-Italic SF 9573 XM (P)SH /Times-Roman SF 10526 XM (is DGI and)48 W /Times-Italic SF 15541 XM (S)SH /Times-Roman SF 16377 XM (is Latin-1, then)48 W /Times-Italic SF 23173 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([160] = 160,)37 W /Times-Italic SF 31590 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([175] = 166,)37 W /Courier SF 40007 XM (...)SH /Times-Roman SF (,)SH /Times-Italic SF 42472 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF ([255] = 254,) 37 W( and the)49 W /Times-Italic SF 7200 69460 MT (P)SH /Times-Roman SF 8169 XM (to)SH /Times-Italic SF 9313 XM (S)SH /Times-Roman SF 10165 XM (mapping is complete. 
The)64 W /Times-Italic SF 21920 XM (s)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (p)SH /Times-Roman SF 25047 XM (array is obtained by exchanging the index and value of each)63 W /Times-Italic SF 51200 XM (p)SH /Times-Roman SF (_)SH /Times-Italic SF (to)SH /Times-Roman SF (_)SH /Times-Italic SF (s)SH /Times-Roman SF 7200 70757 MT (element, as in the program fragment given above.)SH ES %%Page: 7 7 BS 0 SI 10 /Times-Roman AF 30350 4286 MT (7)SH 12.5 /Times-Bold AF 7200 8038 MT (Readability)SH 10.5 /Times-Roman AF 7200 9335 MT (Bidirectional invertibility cannot be achieved) 82 W( when the character sets are different sizes, nor can invertibility)83 W 7200 10632 MT (be achieved from a larger set to a smaller set.) 40 W( In) 341 W( such cases, readability becomes the only sensible translation)39 W 7200 11929 MT (goal. Even) 263 W( in cases where invertibility is possible, readability might be preferred for a particular data transfer.)SH 8850 13226 MT (When translating from a larger set,)41 W /Times-Italic SF 24000 XM (A)SH /Times-Roman SF (, to a smaller) 41 W( one,)42 W /Times-Italic SF 32580 XM (B)SH /Times-Roman SF (, several different characters in)42 W /Times-Italic SF 46671 XM (A)SH /Times-Roman SF 47618 XM (can be mapped)42 W 7200 14523 MT (to a single character in)SH /Times-Italic SF 16970 XM (B)SH /Times-Roman SF (. For) 263 W( example, the following Latin-1 characters:)SH 12.5 /Times-Accent AF 7200 16678 MT (d) 626 MX(a) 626 MX(b) 626 MX(f) 626 MX(c) 626 MX(e) 626 MX(3)SH 10.5 /Times-Roman AF 7200 18695 MT (might all be mapped to the letter `)119 W 22198 XM (`a')SH 23264 XM (' when) 119 W( translating to ASCII. In the resulting ASCII file, we can't tell)118 W 7200 19992 MT (where a particular `)126 W 15672 XM (`a')SH 16738 XM (' came from, so we can't reconstruct the original Latin-1 text when translating in) 126 W( the)127 W 7200 21289 MT (reverse direction. 
But the ASCII file is more intelligible than if we had used some other) 116 W( mapping, such as)115 W 7200 22586 MT (simply stripping off the high-order bit. Translation) 113 W( by removing diacritics is useful with Roman-based lan-)114 W 7200 23883 MT (guages, such as French; `)40 W 17846 XM (`p)SH /Times-Accent SF (b)SH /Times-Roman SF (t)SH /Times-Accent SF (h)SH /Times-Roman SF (')SH 20195 XM (' becomes `)40 W 25074 XM (`pate')SH 27423 XM (' rather than \050say\051 `)40 W 35591 XM (`pbti')SH 37825 XM ('. Or) 343 W( German: `)39 W 44680 XM (`Gr)SH /Times-Accent SF (y_)SH /Times-Roman SF (e aus K)39 W /Times-Accent SF (s)SH /Times-Roman SF (ln')SH 52007 XM (' be-)39 W 7200 25180 MT (comes `)SH 10395 XM (`Gruse aus Koln')SH 17527 XM (' instead of `)SH 22765 XM (`Gr)SH /Courier SF (|_)SH /Times-Roman SF (e aus Kvln')SH 30224 XM ('.)SH 8850 26477 MT (In German, the words `)194 W 19297 XM (`Gruse')SH 22404 XM (' and `)194 W 25434 XM (`Gr)SH /Times-Accent SF (y_)SH /Times-Roman SF (e')SH 28658 XM (' have entirely different meanings \050we don't want to) 194 W( say)195 W 7200 27774 MT (`)SH 7450 XM (`soot')SH 9800 XM (' when we mean `)147 W 17812 XM (`greetings')SH 22261 XM ('\051. We) 557 W( can do) 147 W( better. European languages like German, Swedish, Nor-)146 W 7200 29071 MT (wegian, Danish, Icelandic, and) 98 W( Dutch have rules for converting accented or other special characters into un-)99 W 7200 30368 MT (adorned ABC's. For example, any German vowel with an umlaut \050diaeresis\051) 3 W( can be written without the umlaut)2 W 7200 31665 MT (and followed by the) 36 W( letter `)37 W 18687 XM (`e')SH 19753 XM ('. These) 337 W( rules are specific to each language. 
So while we can write the German)37 W 7200 32962 MT (word `)SH 9871 XM (`K)SH /Times-Accent SF (s)SH /Times-Roman SF (ln')SH 12571 XM (' as `)SH 14571 XM (`Koeln')SH 17737 XM (', we cannot write the English word `)SH 33272 XM (`co)SH /Times-Accent SF (s)SH /Times-Roman SF (peration')SH 38829 XM (' as `)SH 40829 XM (`cooepoeration')SH 47377 XM ('.)SH 8850 34259 MT (Such language rules can not be applied blindly in) 83 W( reverse. For example, if `)82 W 41758 XM (`oe')SH 43349 XM (' were translated back to)82 W 7200 35556 MT (`)SH 7450 XM (`)SH /Times-Accent SF (s)SH /Times-Roman SF (')SH 8575 XM (', then `)791 W 13354 XM (`Kommandoebene')SH 21360 XM (' would become `)791 W 31012 XM (`Kommand)SH /Times-Accent SF (s)SH /Times-Roman SF (bene')SH 38552 XM (' \050not a German word\051,) 791 W( and)792 W 7200 36853 MT (AUTOEXEC)SH /Courier SF (.)SH /Times-Roman SF (BAT would become AUT)SH /Times-Accent SF (S)SH /Times-Roman SF (XEC)SH /Courier SF (.)SH /Times-Roman SF (BAT \050a PC file that you don't want to rename!\051.)SH 8850 38150 MT (Construction of a readable translation between two entirely different alphabets, such as Cyrillic and)256 W 7200 39447 MT (Roman, is called transliteration. The specific) 4 W( transliteration rules must take into account not only the alphabets)5 W 7200 40744 MT (themselves, but also what languages they represent. For example, the surname of) 3 W( a former leader of the former)2 W 7200 42041 MT (USSR,)SH 9.5 /CyrillicGothic AF 10357 XM (Kru7ev)SH 10.5 /Times-Roman AF (, is transliterated into Roman letters as `)5 W 30613 XM (`Khrushchev')SH 36286 XM (' in English, but into) 5 W( `)6 W 45492 XM (`Khruschtschew')SH 52564 XM (' in)6 W 7200 43338 MT (German.)SH 8850 44635 MT (Newspapers and magazines, libraries, immigrant bureaus, and other organizations have their own) 37 W( standard)36 W 7200 45932 MT (procedures for transliterating `)186 W 20461 XM (`foreign')SH 24094 XM (' writing) 186 W( systems. 
Not just in `)187 W 38373 XM (`ASCII-speaking')SH 45797 XM (' lands, but every-)187 W 7200 47229 MT (where: Russian names are written in Arabic newspapers, Hebrew names in Greek journals,) 62 W( English names on)61 W 7200 48526 MT (Chinese passports, Korean publications in Vietnamese library catalogs. But these standards are not widely)139 W 7200 49823 MT (known. When) 263 W( a standard can be found, use it. If not, look harder.)SH 15.5 /Times-Bold AF 7200 52427 MT (Character-Set Translation in the Kermit File Transfer Protocol)SH 10.5 /Times-Roman AF 7200 53724 MT (The Kermit File Transfer Protocol was developed at Columbia University to allow the transfer of) 13 W( both text and)12 W 7200 55021 MT (binary files among) 9 W( all types of personal computers, minicomputers, and mainframes, in both the 7-bit and 8-bit)10 W 7200 56318 MT (communication environments. Kermit is a layered, point-to-point,) 245 W( transport-independent, error-correcting)244 W 7200 57615 MT (packet protocol described in detail elsewhere [5].)SH 8850 58912 MT (Transfer of text files between unlike computers requires conversion of both record format and character) 10 W( set)11 W 7200 60209 MT (at the presentation layer. For example, a document composed under the UNIX operating system using the AS-)13 W 7200 61506 MT (CII character set with lines separated by) 19 W( imbedded Linefeed characters, upon transfer to an IBM mainframe, is)20 W 7200 62803 MT (converted to EBCDIC encoding and a mainframe-specific variable- or fixed-length record format. 
Kermit ac-)41 W 7200 64100 MT (complishes this conversion with another Simple Rule: during file transfer, the character) 4 W( set used for text files is)5 W 7200 65397 MT (ASCII, and the record format is stream, with records \050lines\051 delimited by Carriage Return and Linefeed.)SH 8850 66694 MT (Thus, it is the responsibility of each Kermit program to convert between the) 45 W( text character sets and record)44 W 7200 67991 MT (formats of its own computer and the standard Kermit format. This means that no Kermit program needs) 106 W( to)107 W 7200 69288 MT (know the specific codes) 6 W( and formats of any kind of computer except its own, and it forms the basis of Kermit's)5 W 7200 70585 MT (strategy for converting between different character sets.) 198 W( This) 661 W( idea is known as a `)199 W 44264 XM (`common intermediate)199 W 7200 71882 MT (representation,')SH 13661 XM (' and it lies at the heart of any presentation-layer protocol [26].)SH ES %%Page: 8 8 BS 0 SI 10 /Times-Roman AF 30350 4286 MT (8)SH 10.5 SS 8850 7920 MT (By the mid 1980s, Kermit had become a de facto standard for file transfer. Kermit software programs) 26 W( had)25 W 7200 9217 MT (been written) 6 W( for almost every kind of computer in existence. But the Kermit protocol lacked a formal and con-)7 W 7200 10514 MT (sistent means) 119 W( for exchanging text that contained non-Roman or accented Roman characters. Files could be)118 W 7200 11811 MT (transferred, but the results would be gibberish unless the receiving computer supported the same character set)52 W 7200 13108 MT (as the sender.)SH 8850 14405 MT (At first, this problem was remedied by pre- or postprocessing. But) 77 W( this approach places an unreasonable)76 W 7200 15702 MT (burden on the user. Not only must extra steps be taken, but a suitable translation utility) 117 W( must be found for)118 W 7200 16999 MT (every pair of character sets. 
More subtly, translation utilities \050for example, between an IBM code) 58 W( page and a)57 W 7200 18296 MT (Macintosh character set\051 are) 168 W( constructed in an ad-hoc manner, with no guarantee of consistency from one)169 W 7200 19593 MT (utility to another.)SH 8850 20890 MT (The problem is compounded by the) 70 W( rapid proliferation of proprietary, national, and international standard)69 W 7200 22187 MT (character sets. By the late) 196 W( 1980s, there were many encodings for each major writing system, a problem)197 W 7200 23484 MT (pointed out by attendees) 2 W( at international conference sessions on Kermit in Europe and Japan) 1 W( [10].) SH( A) 265 W( consistent)1 W 7200 24781 MT (approach to character-set translation had become an urgent matter.)SH 12.5 /Times-Bold AF 7200 26920 MT (Basic Design Principles)SH 10.5 /Times-Roman AF 7200 28217 MT (How can we enable) 112 W( meaningful exchange of text between any two computers? The obvious approach is to)113 W 7200 29514 MT (require each) 109 W( data-transfer application to understand every character set in existence. This works adequately)108 W 7200 30811 MT (when the number of sets is small and) 52 W( stable, but quickly becomes unwieldy and unmanageable as the number)53 W 7200 32108 MT (increases. 
If) 263 W( the number of character sets is)SH /Times-Italic SF 25864 XM (n)SH /Times-Roman SF (, the number of translations is:)SH /Times-Italic SF 15365 33944 MT (n)SH /Times-Roman SF 16040 XM (!)SH /Times-Italic SF 10166 34134 MT (n)SH 4577 50 13589 34374 UL /Times-Roman SF 12277 34619 MT (= =)5754 W /Times-Italic SF 20198 XM (n)SH /Symbol SF 21023 XM (\264)SH /Times-Roman SF 21899 XM (\050)SH /Times-Italic SF (n)SH /Symbol SF 22924 XM (-)SH /Times-Roman SF 23650 XM (1\051)SH /Times-Bold SF 50675 XM (\0501\051)SH 26 /Symbol AF 9300 35053 MT (\050)SH 10691 XM (\051)SH 10.5 /Times-Roman AF 10166 35284 MT (2)SH 13589 35324 MT (2!)SH /Symbol SF 14614 XM (\264)SH /Times-Roman SF 15340 XM (\050)SH /Times-Italic SF (n)SH /Symbol SF (-)SH /Times-Roman SF (2\051) 150 MX(!)SH 7200 36909 MT (If we have two character sets,)32 W /Times-Italic SF 20135 XM (A)SH /Times-Roman SF 21072 XM (and)SH /Times-Italic SF 22883 XM (B)SH /Times-Roman SF (, we need two translations, one from)32 W /Times-Italic SF 39296 XM (A)SH /Times-Roman SF 40233 XM (to)SH /Times-Italic SF 41345 XM (B)SH /Times-Roman SF 42282 XM (and one from)32 W /Times-Italic SF 48241 XM (B)SH /Times-Roman SF 49178 XM (to)SH /Times-Italic SF 50289 XM (A)SH /Times-Roman SF (. If) 325 W( we)31 W 7200 38206 MT (have three sets\320)SH /Times-Italic SF (A)SH /Times-Roman SF (,)SH /Times-Italic SF 15599 XM (B)SH /Times-Roman SF (, and)SH /Times-Italic SF 18546 XM (C)SH /Times-Roman SF (\320we need 3)SH /Symbol SF 24703 XM (\264)SH /Times-Roman SF 25429 XM (2 = 6) 37 W( translations:)SH /Times-Italic SF 33330 XM (AB)SH /Times-Roman SF (,)SH /Times-Italic SF 35140 XM (BA)SH /Times-Roman SF (,)SH /Times-Italic SF 36950 XM (AC)SH /Times-Roman SF (,)SH /Times-Italic SF 38818 XM (CA)SH /Times-Roman SF (,)SH /Times-Italic SF 40686 XM (BC)SH /Times-Roman SF (, and)SH /Times-Italic SF 44333 XM (CB)SH /Times-Roman SF (. 
And) 263 W( so on.)SH 8850 39503 MT (Now consider that in 1990, IBM) 389 W( alone listed 276 different coded character-set identifiers in its)390 W 7200 40800 MT (registry [13].) SH( If) 448 W( we wanted translations between every pair of IBM character sets, there would be 75,900 of)92 W 7200 42097 MT (them! Add) 263 W( in all the other sets from all the other companies to appreciate the magnitude of the problem.)SH 8850 43394 MT (By using a standard intermediate) 85 W( representation for each type of character set \050Roman, Cyrillic, Hebrew,)86 W 7200 44691 MT (Japanese, etc\051, we eliminate the need for any particular computer to know about the character sets used by) 24 W( any)23 W 7200 45988 MT (other kind of computer. Kermit's common) 121 W( intermediate character set, previously always ASCII, is now al-)122 W 7200 47285 MT (lowed to be any of a small number) 32 W( of character sets. The set used during a particular file transfer is called the)31 W 7200 48582 MT (transfer character set \050TCS\051.)SH 8850 49879 MT (The character set) 27 W( of the file that is being sent or received is called the file character set \050FCS\051. The sender)28 W 7200 51176 MT (translates from) 52 W( its local codes \050the FCS\051 to the standard ones \050the TCS\051, and the receiver translates from TCS)51 W 7200 52473 MT (codes to its own FCS, as shown in Figure 2.)SH 8850 53770 MT (For a particular file and transfer) 50 W( character set combination, a Kermit program has one translation function)51 W 7200 55067 MT (for sending files and another for receiving them. Theoretically, all combinations of file and) 45 W( transfer character)44 W 7200 56364 MT (set are allowed. 
Thus the number of translation functions,)SH /Times-Italic SF 32198 XM (f)SH /Times-Roman SF (, is given by:)SH /Times-Italic SF 9300 57949 MT (f)SH /Times-Roman SF 10192 XM (=)SH /Times-Italic SF 11384 XM (tcs)SH /Symbol SF 13000 XM (\264)SH /Times-Italic SF 14026 XM (fcs)SH /Symbol SF 15642 XM (\264)SH /Times-Roman SF 16668 XM (2)SH /Times-Bold SF 50675 XM (\0502\051)SH /Times-Roman SF 7200 59534 MT (That is, one function in) 144 W( each direction, for each combination of TCS and FCS. While this number is sig-)145 W 7200 60831 MT (nificantly lower than the number of pairs of all character sets \050Equation 1\051, we still want to reduce it to con-)78 W 7200 62128 MT (serve computer memory and cut down on user confusion.)SH 12.5 /Times-Bold AF 7200 64267 MT (The Transfer Character Set)SH 10.5 /Times-Roman AF 7200 65564 MT (Generally, we want to support all the file character sets) 50 W( used on a particular computer, so the way to keep the)51 W 7200 66861 MT (total number of translation functions small is to minimize the number of transfer) 27 W( character sets that can handle)26 W 7200 68158 MT (the given selection of file character sets. 
This is done, in part, by restricting the set) 191 W( of)192 W /Times-Italic SF 46897 XM (possible)SH /Times-Roman SF 50793 XM (transfer)SH 7200 69455 MT (character sets to a small number according to the following rules:)SH ES %%Page: 9 9 BS 0 SI 10 /Times-Roman AF 30350 4286 MT (9)SH 9.5 /Courier AF 10050 7793 MT (COMPUTER A) SH( COMPUTER) 17670 W( B)SH 7200 8783 MT (+------------------+ +------------------+)11970 W 7200 9773 MT (| +-------------+ |) SH( |) 11970 W( +-------------+) 570 W( |)SH 7200 10763 MT (| | Translation | |) SH( Transfer) 2850 W( |) 3990 W( |) 570 W( Translation | |)SH 7200 11753 MT (| | Function:) SH( |--------------------------->|) 1140 W( Function:) SH( |) 1140 W( |)SH 7200 12743 MT (| | FCS to TCS | |) SH( Character) 1710 W( Set) SH( |) 2280 W( |) 570 W( TCS to FCS | |)SH 7200 13733 MT (| +-------------+ |) SH( |) 11970 W( +-------------+) 570 W( |)SH 7200 14723 MT (| ^) 3420 W( |) 5130 W( |) 11970 W( |) 3990 W( |)4560 W 7200 15713 MT (| |) 3420 W( |) 5130 W( |) 11970 W( v) 3990 W( |)4560 W 7200 16703 MT (| Kermit) 570 W( Program |) SH( |) 11970 W( Kermit) 570 W( Program |)SH 7200 17693 MT (| SEND) 2850 W( |) 3990 W( |) 11970 W( RECEIVE) 2280 W( |)2850 W 7200 18683 MT (+------------------+ +------------------+)11970 W 11760 19673 MT (^ |)23370 W 11760 20663 MT (| v)23370 W 7200 21653 MT (+------------------+ +------------------+)11970 W 7200 22643 MT (| Local) 570 W( File) SH( |) 2850 W( |) 11970 W( Local) 570 W( File) SH( |)2850 W 7200 23633 MT (| Character) 570 W( Set A |) SH( |) 11970 W( Character) 570 W( Set B |)SH 7200 24623 MT (+------------------+ +------------------+)11970 W /Times-Bold SF 7200 26273 MT (Figure 2:)SH /Times-Roman SF 11396 XM (File Transfer Character-Set Translation)SH 10.5 SS 7200 29993 MT (1.)SH 8796 XM (The transfer character set must be a national or international standard character set registered with the ISO,)SH 8796 31290 MT (or a combination of such sets. 
This means that its structure is consistent with other standard character sets)SH 8796 32587 MT (and that it has a unique identifier. Furthermore, it means that the character set is well known, its specifica-)SH 8796 33884 MT (tion is readily available, and the characters have names.)SH 7200 35901 MT (2.)SH 8796 XM (US ASCII [1] is included for compatibility with the original Kermit protocol and with unextended Kermit)SH 8796 37198 MT (programs.)SH 7200 39215 MT (3.)SH 8796 XM (The ISO 8859 Latin Alphabets [18] are included.)SH 7200 41232 MT (4.)SH 8796 XM (The ISO-registered Chinese [4], Japanese [22, 23, 24], and Korean [25] sets are included. These are)SH 8796 42529 MT (usually used in conjunction with one or more single-byte sets that provide control characters and)SH 8796 43826 MT (single-width ASCII or ISO 646 graphics.)SH 7200 45843 MT (5.)SH 8796 XM (Additional sets, such as \050for example\051 Vietnamese VSCII [8, 28], can be included if they are registered)SH 8796 47140 MT (with the ISO, as long as they are not proper subsets of any of those already included.)SH 7200 49157 MT (6.)SH 8796 XM (All else being equal, a simple and compact representation is preferred.)SH 8850 51174 MT (The national versions of ISO 646) 2 W( [14]) SH( \050other than US ASCII\051 are) 2 W( not included because of Rule 5; these sets)1 W 7200 52471 MT (are covered adequately by the ISO Latin alphabets. CCITT) 201 W( \050ITU-T\051 T.61) 202 W( [2],) SH( which represents accented)202 W 7200 53768 MT (characters exclusively by composition, is not included for reasons 1, 5, and 6.)SH 8850 55065 MT (Table 1 lists the transfer character sets presently allowed by) 4 W( the Kermit protocol. The Kermit Name allows)3 W 7200 56362 MT (uniform reference to these sets by Kermit software users.) 90 W( The) 444 W( requirement for ISO registration provides for)91 W 7200 57659 MT (unique and incontestible identifiers) 76 W( for Kermit's transfer character sets. 
The Kermit Designator is the means)75 W 7200 58956 MT (by which) 167 W( the sending Kermit program informs the receiver of the transfer character set, a key part of the)168 W 7200 60253 MT (presentation protocol. As noted earlier, however, ISO standards do not provide a) 39 W( single designator for a com-)38 W 7200 61550 MT (plete character set, but rather separate) 168 W( designators for its pieces. Thus Latin-1 is designated as `)169 W 50045 XM (`I6/100')SH 53387 XM (',)SH 7200 62847 MT (meaning that the left half \050G0\051 is ASCII and the right half \050G1\051 is `)44 W 35779 XM (`the Right-hand Part of Latin Alphabet 1.')44 W 53650 XM (')SH 7200 64144 MT (The C0 and C1) 45 W( control regions are not explicitly designated. The C0 region is assumed to be the normal AS-)46 W 7200 65441 MT (CII and ISO 646 control set, with format effectors used to) 17 W( delimit records and so on. The C1 set is assumed to)16 W 7200 66738 MT (be ISO 6429 to allow the use of character-set shifting functions such as SS2 and SS3) 140 W( [15],) SH( for example) 140 W( in)141 W 7200 68035 MT (Japanese EUC.)SH 8850 69332 MT (In the Kermit Designator, the initial letter `)62 W 27270 XM (`I')SH 28220 XM (' indicates ISO registration numbers for character sets,) 62 W( leav-)61 W 7200 70629 MT (ing open the possibility for other registration authorities. 
Japanese) 79 W( EUC \050Extended UNIX Code\051 is a special)80 W 7200 71926 MT (case, having three parts, chosen in preference) 128 W( to JIS X 0208 alone to allow the commonly used mixture of)127 W ES %%Page: 10 10 BS 0 SI 10 /Times-Roman AF 30100 4286 MT (10)SH 9.5 /Times-Bold AF 7200 7852 MT (Table 1:)SH /Times-Roman SF 11028 XM (Kermit Transfer Character Sets)SH 44373 70 7200 8740 LH BX1 /Times-Italic SF 22179 9789 MT (ISO)SH 7599 10849 MT (Kermit)SH 22179 XM (Registration)SH 27675 XM (Kermit)SH 7599 11909 MT (Name)SH 15257 XM (Standard)SH 22179 XM (Number)SH 27675 XM (Designator)SH 32749 XM (Languages)SH 44373 40 7200 12504 LH BX1 /Times-Roman SF 7599 13555 MT (ASCII)SH 15257 XM (ANSI X3.4)SH 24692 XM (6)SH /Times-Italic SF 27675 XM (\050none\051)SH /Times-Roman SF 32749 XM (English, Latin)SH 7599 15005 MT (LATIN1)SH 15257 XM (ISO 8859-1)SH 23742 XM (100)SH 27675 XM (I6/100)SH 32749 XM (Danish, Dutch, English, Faeroese, Finnish,)SH 32749 16065 MT (French, German, Icelandic, Irish, Italian, Nor-)SH 32749 17125 MT (wegian, Portuguese, Spanish, and Swedish.)SH 7599 18575 MT (LATIN2)SH 15257 XM (ISO 8859-2)SH 23742 XM (101)SH 27675 XM (I6/101)SH 32749 XM (Albanian, Czech, English, German, Hungarian,)SH 32749 19635 MT (Polish, Romanian, Croatian, Slovak, and)SH 32749 20695 MT (Slovene.)SH 7599 22145 MT (LATIN3)SH 15257 XM (ISO 8859-3)SH 23742 XM (109)SH 27675 XM (I6/109)SH 32749 XM (Afrikaans, Catalan, Dutch, English, Esperanto,)SH 32749 23205 MT (French, Galician, German, Italian, Maltese,)SH 32749 24265 MT (Spanish, and Turkish.)SH 7599 25715 MT (LATIN4)SH 15257 XM (ISO 8859-4)SH 23742 XM (110)SH 27675 XM (I6/110)SH 32749 XM (Danish, English, Estonian, Finnish, German,)SH 32749 26775 MT (Greenlandic, Lappish \050Sami\051, Latvian, Lith-)SH 32749 27835 MT (uanian, Norwegian, and Swedish.)SH 7599 29285 MT (LATIN5)SH 15257 XM (ISO 8859-9)SH 23742 XM (148)SH 27675 XM (I6/148)SH 32749 XM (Danish, Dutch, English, Faeroese, Finnish,)SH 32749 30345 MT (French, German, 
Irish, Italian, Norwegian, Por-)SH 32749 31405 MT (tuguese, Spanish, Swedish, and Turkish.)SH 7599 32855 MT (CYRILLIC)SH 15257 XM (ISO 8859-5)SH 23742 XM (144)SH 27675 XM (I6/144)SH 32749 XM (Bulgarian, Byelorussian, English, Macedonian,)SH 32749 33915 MT (Russian, Serbocroatian \050Serbian\051, and Ukrainian)SH 7599 35365 MT (ARABIC)SH 15257 XM (ISO 8859-6)SH 23742 XM (127)SH 27675 XM (I6/127)SH 32749 XM (Arabic)SH 7599 36815 MT (GREEK)SH 15257 XM (ISO 8859-7)SH 23742 XM (126)SH 27675 XM (I6/126)SH 32749 XM (Greek)SH 7599 38265 MT (HEBREW)SH 15257 XM (ISO 8859-8)SH 23742 XM (138)SH 27675 XM (I6/138)SH 32749 XM (Hebrew)SH 7599 39715 MT (KATAKANA)SH 15257 XM (JIS X 0201)SH 22791 XM (14, 13)SH 27675 XM (I14/13)SH 32749 XM (Japanese \050Roman and Katakana\051)SH 7599 41165 MT (JAPANESE-EUC)SH 15257 XM (JIS X 0201,)SH 22791 XM (14, 13)SH 27675 XM (I14/87/13)SH 32749 XM (Japanese \050Roman, Katakana, Hiragana, and)SH 15257 42225 MT (JIS X 0208)SH 24217 XM (87)SH 32749 XM (Kanji\051, English, Greek, Russian)SH 7599 43675 MT (CHINESE)SH 15257 XM (CS GB 2312-80)SH 24217 XM (58)SH 27675 XM (I55/58)SH 32749 XM (Chinese \050Roman, Phonetic, and Hanzi\051,)SH 32749 44735 MT (Japanese \050Roman, Katakana, Hiragana\051,)SH 32749 45795 MT (English, Greek, Russian)SH 7599 47245 MT (KOREAN)SH 15257 XM (KS C 5601)SH 23742 XM (149)SH 27675 XM (I6/149)SH 32749 XM (Korean \050Hangul, Hanja\051, Japanese \050Roman,)SH 32749 48305 MT (Katakana, Hiragana\051, Greek, Russian, English,)SH 32749 49365 MT (and others)SH 7599 50815 MT (VIETNAMESE)SH 15257 XM (TCVN 5712)SH 23742 XM (180)SH 27675 XM (I6/180)SH 32749 XM (Vietnamese)SH 44373 70 7200 51422 LH BX1 10.5 SS 7200 54542 MT (single-width and double-width characters. 
The registration numbers are) 88 W( listed in G0/G1/G2 order, so SS2 is)89 W 7200 55839 MT (required to shift between Kanji \050ISO 87\051 and Katakana \050ISO 13\051 in accordance with ISO 2022.)SH 8850 57136 MT (This notation is used in preference to, say, the name of the standard itself \050for example) 41 W( `)40 W 46389 XM (`ISO8859-1')SH 51656 XM ('\051 be-)40 W 7200 58433 MT (cause the same character) 49 W( set can be defined by more than one standard \050for example, Latin-1 and ECMA 94\051,)50 W 7200 59730 MT (and one standard can specify more than one character set \050e.g. ISO 646\051.)SH 8850 61027 MT (Using a standard international character set as the TCS, it is possible to transfer text written in a language)47 W 7200 62324 MT (other than English between unlike computers,) 55 W( and it is usually also possible to transfer text containing a mix-)56 W 7200 63621 MT (ture of languages. For example, text in Latin Alphabet 1 might contain a mixture) 181 W( of Italian, Norwegian,)180 W 7200 64918 MT (French, German, English, and Icelandic.)SH 8850 66215 MT (A particular Kermit program need not incorporate all the defined transfer character sets. In) 43 W( many cases, a)44 W 7200 67512 MT (single 8-bit set will suffice, such as LATIN1 for Western Europe, LATIN2 for Eastern European languages)112 W 7200 68809 MT (with Roman-based writing systems, CYRILLIC) 48 W( for Russia, and so on. Thus Equation 2 generally results in a)49 W 7200 70106 MT (comfortably small number.) 
42 W( For) 345 W( example, an IBM PC that supports five Roman-alphabet code pages for Wes-)41 W 7200 71403 MT (tern European) 69 W( languages plus a Cyrillic code page can be used with two transfer character sets, LATIN1 and)70 W 7200 72700 MT (CYRILLIC for a total of 24 translation functions.)SH ES %%Page: 11 11 BS 0 SI 10 /Times-Roman AF 30100 4286 MT (11)SH 10.5 SS 8850 7920 MT (When a language) 133 W( is representable in more than one set from Table 1, as are English, German, Finnish,)132 W 7200 9217 MT (Turkish, Greek, Russian, etc., the character set highest on the list that adequately represents) 184 W( the language)185 W 7200 10514 MT (should be used.) 408 W( For) 1077 W( example, ASCII should be used for English. Within the ISO 8859 family,)407 W 7200 11811 MT (lower-numbered sets that contain all the characters of interest are preferred to higher-numbered) 14 W( sets containing)15 W 7200 13108 MT (the same characters.)SH 8850 14405 MT (This guideline maximizes the chance that any) 90 W( two particular Kermit programs will interoperate. For ex-)89 W 7200 15702 MT (ample, LATIN1 would be chosen for French, German, Italian, Spanish, Danish, Dutch, Swedish,) 30 W( etc; LATIN3)31 W 7200 16999 MT (for Turkish; JAPANESE-EUC) 7 W( for Japanese text that includes Kanji characters, KATAKANA for Japanese text)6 W 7200 18296 MT (that includes only Roman and Katakana characters, and so on.)SH 8850 19593 MT (If a file containing a mixture of languages, say English, Finnish, and Latvian, must) 27 W( be transferred, the user)28 W 7200 20890 MT (must find a) 40 W( transfer character set that can adequately represent all three languages, in this case Latin Alphabet)39 W 7200 22187 MT (4. For) 263 W( a mixture of Norwegian and Turkish, Latin-5 must be used, and so on.)SH 8850 23484 MT (The user can employ this flexibility to achieve useful effects. 
For example, since there is no) 60 W( requirement)61 W 7200 24781 MT (that a Cyrillic file be transferred using a Cyrillic transfer character set, the user can transliterate between) 15 W( Cyril-)14 W 7200 26078 MT (lic and Roman characters as part of the file transfer process.)SH 12.5 /Times-Bold AF 7200 28217 MT (The Translation Function)SH 10.5 /Times-Roman AF 7200 29514 MT (A typical) 173 W( Kermit program contains an)174 W /Times-Italic SF 24460 XM (m)SH /Symbol SF 25368 XM (\264)SH /Times-Italic SF 26094 XM (n)SH /Symbol SF 26769 XM (\264)SH /Times-Roman SF 27495 XM (2 matrix of translation functions, where)174 W /Times-Italic SF 45428 XM (m)SH /Times-Roman SF 46623 XM (is the number of)174 W 7200 30811 MT (supported file character sets,)5 W /Times-Italic SF 19497 XM (n)SH /Times-Roman SF 20290 XM (is the number of supported transfer character sets \050including the)5 W /Times-Italic SF 47405 XM (transparent)SH /Times-Roman SF 52571 XM (set,)SH 7200 32108 MT (which indicates that no translation is to be done\051, and there are two tables for each) 179 W( combination, one for)180 W 7200 33405 MT (sending and one for receiving. The translation function is selected when the user identifies the file) 6 W( and transfer)5 W 7200 34702 MT (character sets and then sends or receives a file.)SH 8850 35999 MT (The normal behavior of a particular function) 134 W( can be altered in several ways. The user can override its)135 W 7200 37296 MT (default translation goal, and when the goal is readability \050as it must be when translating from) 133 W( a larger to a)132 W 7200 38593 MT (smaller set\051, language-specific rules can be invoked.)SH 8850 39890 MT (The function itself can work by any combination of algorithm, translation table, exception list, and shame-)27 W 7200 41187 MT (less tricks. 
For example, an invertible) 8 W( translation between IBM Code Page 437 and Latin Alphabet 1 would be)7 W 7200 42484 MT (a simple) 90 W( indexing operation into a table, but a translation from Japanese EUC to the PC `)91 W 45921 XM (`Shift JIS')91 W 50260 XM (' code is)91 W 7200 43781 MT (normally accomplished by a tableless algorithm.) 112 W( Translation) 486 W( from Latin-1 to ASCII with German language)111 W 7200 45078 MT (rules could be done with a combination of table accesses and exception lists. To accomplish) 22 W( the desired trans-)23 W 7200 46375 MT (lation, each Kermit program needs to know:)SH /Symbol SF 7200 48512 MT (\267)SH /Times-Roman SF 8796 XM (The local file character set.)SH /Symbol SF 7200 50649 MT (\267)SH /Times-Roman SF 8796 XM (The transfer character set to be used.)SH /Symbol SF 7200 52786 MT (\267)SH /Times-Roman SF 8796 XM (The translation goal, invertibility or readability.)SH /Symbol SF 7200 54923 MT (\267)SH /Times-Roman SF 8796 XM (For readable translations, optionally, the language and a corresponding set of language-specific rules.)SH 7200 56940 MT (In most situations, some or all of these are implicit, and no particular efforts) 70 W( are required. To illustrate, sup-)69 W 7200 58237 MT (pose that you have an) 51 W( IBM PC on your desk, and the PC is connected to a Hewlett Packard \050HP\051 timesharing)52 W 7200 59534 MT (computer. The) 393 W( PC uses IBM code page 850 and the timesharing computer uses the HP Roman8 Set.) 65 W( In) 391 W( that)64 W 7200 60831 MT (case, your PC's file character set is always CP850,) 47 W( the timesharing computer's file character set is always HP)48 W 7200 62128 MT (Roman8, and the transfer character set is always Latin-1. 
These items can be set in your Kermit profiles, and)47 W 7200 63425 MT (the appropriate translations will always occur automatically.)SH 8850 64722 MT (On the other hand, suppose you must occasionally write) 33 W( some text in German and send it from your PC to)34 W 7200 66019 MT (another computer that supports only ASCII. In) 46 W( this case you would override your Kermit profiles by specify-)45 W 7200 67316 MT (ing a transfer character set of ASCII, which automatically activates the readability goal, and) 103 W( you might also)104 W 7200 68613 MT (choose to elect language-specific rules for German.)SH ES %%Page: 12 12 BS 0 SI 10 /Times-Roman AF 30100 4286 MT (12)SH 12.5 /Times-Bold AF 7200 8038 MT (Examples)SH 10.5 /Times-Roman AF 7200 9335 MT (Let's look at a few of many possible translation scenarios. Each one presents its own set of problems) 4 W( requiring)3 W 7200 10632 MT (decisions by the creator of the translation function, or by the user.)SH 7200 12649 MT (1.)SH 8796 XM (From a 7-bit set to a different 7-bit set, e.g. from the Spanish version of ISO 646 to ASCII \050or vice versa\051.)SH 8796 13946 MT (The two sets do not contain the same characters. Here we must choose between readability \050R\051 and inver-)SH 8796 15243 MT (tibility \050I\051. To achieve readability in the Spanish-to-ASCII direction, we strip diacritical marks \050n-tilde be-)SH 8796 16540 MT (comes simply n, and so on\051. To achieve invertibility, we make no translation at all.)SH 7200 18557 MT (2.)SH 8796 XM (From a 7-bit set to an 8-bit set. The 7-bit sets are usually ASCII or an ISO 646 national version. Often, all)SH 8796 19854 MT (the characters from the 7-bit set are also present in the 8-bit set, and there is no R-versus-I conflict. For ex-)SH 8796 21151 MT (ample: ASCII \050and most ISO 646 national variants\051 to Latin-1\320here we satisfy both R and I. In other)SH 8796 22448 MT (cases we must choose between R and I. 
For example: the ISO 646 Italian national variant to ISO Latin /)SH 8796 23745 MT (Arabic: here we either remove the accents for readability, or map the accented characters into right-half)SH 8796 25042 MT (characters for invertibility.)SH 7200 27059 MT (3.)SH 8796 XM (From an 8-bit set to another 8-bit set. A common case is converting between one of the corporate)SH 8796 28356 MT (`)SH 9046 XM (`extended ASCII')SH 16441 XM (' sets \050Digital, IBM, HP, Apple, NeXT, Data General\051 and ISO Latin-1. The two sets)SH 8796 29653 MT (share a large percentage of common characters. How do we handle the characters that differ? Again, we)SH 8796 30950 MT (must choose between R and I. To complicate matters, the IBM, Apple, and NeXT sets use the forbidden)SH 8796 32247 MT (C1 control-character area for graphic characters. To create an invertible translation in the absence of an of-)SH 8796 33544 MT (ficial corporate standard, we use the Simple Rule.)SH 7200 35561 MT (4.)SH 8796 XM (From an 8-bit set to a 7-bit set. For example, from Latin-1 to ASCII or to an ISO 646 national set. Here)SH 8796 36858 MT (we are forced to accept a large amount of information loss. We cannot possibly achieve invertibility, so)SH 8796 38155 MT (we aim for maximum readability, for example by removing diacritics or invoking language-specific rules.)SH 7200 40172 MT (5.)SH 8796 XM (From a single-byte character set to a multibyte character set. Most multibyte character sets include ASCII)SH 8796 41469 MT (and sometimes several other alphabets \050such as Greek and Cyrillic\051. Here we translate each character into)SH 8796 42766 MT (its equivalent, if it has one. When it doesn't, we must choose between R and I. 
For example, `)SH 48308 XM (`)SH /Times-Accent SF (S)SH /Times-Roman SF (')SH 49666 XM (' is not)SH 8796 44063 MT (found in JIS X 0208 so it can be mapped to `)SH 27512 XM (`O')SH 28870 XM (' for readability, or some unique value \050preferably one un-)SH 8796 45360 MT (assigned in JIS X 0208\051 for invertibility.)SH 7200 47377 MT (6.)SH 8796 XM (From a multibyte set to a single-byte set, for example Japanese JIS X 0208 into Latin-1 \050or Latin/Cyrillic,)SH 8796 48674 MT (Latin/Greek, or even ASCII\051. An invertible translation is clearly impossible. A readable translation would)SH 8796 49971 MT (require rendering Kanji ideograms phonetically or translating them into an entirely different language,)SH 8796 51268 MT (clearly beyond the scope of a character-set conversion scheme.)SH 7200 53285 MT (7.)SH 8796 XM (From one national multibyte set to another. These sets are for Chinese, Japanese, and Korean, and have a)SH 8796 54582 MT (very large number of characters\320ideograms, ASCII graphics, Greek, and Cyrillic characters\320in common.)SH 8796 55879 MT (They also have large blocks of unassigned character positions, so the characters they do not share in com-)SH 8796 57176 MT (mon \050such as the Chinese phonetic symbols that are absent from the standard Japanese set\051 can be assigned)SH 8796 58473 MT (to these areas to preserve invertibility.)SH 7200 60490 MT (No two programmers are likely make the same decisions) 48 W( and this will lead to inconsistent translations \050unless)49 W 7200 61787 MT (the Simple Rule is followed\051. This emphasizes the need for officially) 188 W( published translations between the)187 W 7200 63084 MT (private and standard sets. And as this list suggests, we also need) 98 W( translations between some of the standard)99 W 7200 64381 MT (sets themselves, for example Chinese and Korean. 
This need is addressed to some extent by the) 154 W( Unicode)153 W 7200 65678 MT (books [27], which include the mappings from various character sets to the Unicode set.)SH ES %%Page: 13 13 BS 0 SI 10 /Times-Roman AF 30100 4286 MT (13)SH 12.5 /Times-Bold AF 7200 8038 MT (Performance)SH 10.5 /Times-Roman AF 7200 9335 MT (Character-set translation in itself does not affect the performance of the Kermit file transfer) 114 W( protocol to any)115 W 7200 10632 MT (significant degree. The introduction of per-character translation introduces an) 104 W( extra table access or function)103 W 7200 11929 MT (call but the extra work is usually minimal. In general, the bottlenecks are elsewhere.)SH 8850 13226 MT (One of the strong points of the Kermit protocol is its ability to transfer 8-bit) 50 W( data in the 7-bit communica-)51 W 7200 14523 MT (tion environment. This is done) 110 W( using a single shift, or prefixing, technique in which each 8-bit character is)109 W 7200 15820 MT (stripped of its 8th bit and then prefixed by a special shift-indicating character.) 41 W( This) 346 W( results in negligible over-)42 W 7200 17117 MT (head for English and Western European text \050such as French, German, Italian\051.)SH 8850 18414 MT (But for text in `)69 W 15562 XM (`right-handed')SH 21528 XM (' languages like Russian, Greek, Hebrew, and Arabic,) 69 W( where text characters)68 W 7200 19711 MT (come predominantly from the right half of the character set, single shifts can result in up to 80% overhead.)111 W 7200 21008 MT (The situation is even worse for) 25 W( Japanese EUC, in which all Kanji bytes have their 8th bits set to 1, resulting in)24 W 7200 22305 MT (transmission overhead of 100% for pure Kanji text on a 7-bit connection.)SH 8850 23602 MT (Because 7-bit communication is still prevalent, Kermit's support for Greek,) 49 W( Cyrillic, Hebrew, Arabic, and)50 W 7200 24899 MT (Japanese text file transfer calls for a more efficient technique. 
This was accomplished by adding) 118 W( a locking)117 W 7200 26196 MT (shift mechanism to the) 42 W( Kermit protocol, allowing sequences of 8-bit characters to be transmitted in 7-bit form)43 W 7200 27493 MT (with shifting overhead applying to entire 8-bit sequences, rather than to each 8-bit) 118 W( character. Isolated 8-bit)117 W 7200 28790 MT (characters can still be transmitted using single shifts. These methods are very similar to) 2 W( those of ISO 2022, but)3 W 7200 30087 MT (without the risk of `)27 W 15552 XM (`loss-of-state')SH 21284 XM (' due to corruption or loss of) 27 W( the shift characters, and with the addition of the)26 W 7200 31384 MT (`)SH 7450 XM (`single-shift-1')SH 13650 XM (' mechanism lacking from ISO 2022.) 72 W( A) 409 W( combination of single and locking shifts can achieve)73 W 7200 32681 MT (maximum efficiency by using a lookahead technique. A detailed specification is given elsewhere [12].)SH 8850 33978 MT (The addition of locking shifts to the Kermit protocol increases the transfer) 45 W( efficiency on 7-bit connections)44 W 7200 35275 MT (for typical Cyrillic text by about 50% and for typical Kanji) 40 W( text by more than 90%, bringing these transfers to)41 W 7200 36572 MT (within the efficiency range of normal 7-bit ASCII transfers on the same connections.)SH 15.5 /Times-Bold AF 7200 39176 MT (Conclusions and Recommendations)SH 10.5 /Times-Roman AF 7200 40473 MT (File transfer) 181 W( character-set translation is an optional feature for Kermit programs, and is designed to inter-)180 W 7200 41770 MT (operate \050with, of course, no claim to correct translation\051 with) 86 W( Kermit programs that do not support it. 
As of)87 W 7200 43067 MT (this writing, translation of \050at least\051 Roman, Cyrillic, Hebrew,) 155 W( and Japanese text is supported by MS-DOS)154 W 7200 44364 MT (Kermit for the IBM) 100 W( PC) 101 W( [9],) SH( IBM mainframe Kermit for VM, MVS, and CICS) 101 W( [3],) SH( and C-Kermit for UNIX,)101 W 7200 45661 MT (VMS, OS/2, and other operating systems) 67 W( [6].) SH( Three) 397 W( basic commands were added to) 67 W( these programs to select)66 W 7200 46958 MT (the file and transfer character sets and any desired language-specific translation rules. Locking shifts) 36 W( are used)37 W 7200 48255 MT (automatically in) 88 W( the 7-bit communication environment to prevent Kermit from discriminating against `)87 W 51316 XM (`right-)SH 7200 49552 MT (handed')SH 10482 XM (' character sets.) 54 W( Kermit) 372 W( programs equipped with the new translation features have become popular in)55 W 7200 50849 MT (Europe, Latin America, the former Soviet Union, and Japan. Work is in) 116 W( progress to add further translation)115 W 7200 52146 MT (capabilities for other parts of the world.)SH 8850 53443 MT (Space has not permitted discussion of the details of the Kermit) 8 W( protocol, the forms of the commands, trans-)9 W 7200 54740 MT (lation negotiation and refusal mechanisms, unilateral and local translation features, automatic matching of file)44 W 7200 56037 MT (and transfer character sets, character sets) 141 W( in terminal emulation, and numerous other issues. These will be)142 W 7200 57334 MT (covered in a future edition of reference [5].)SH 8850 58631 MT (In the discussions) 78 W( that resulted in the character-set translation extension to Kermit, the most fundamental)77 W 7200 59928 MT (lesson we have learned is that if) 28 W( existing standards can solve a particular problem, they should be used instead)29 W 7200 61225 MT (of inventing new techniques to solve the same problem. 
Applying this lesson to Kermit file transfer results in)38 W 7200 62522 MT (the following conclusions:)SH 7200 64539 MT (1.)SH 8796 XM (Only ISO-registered standard character sets should be used for interchange. This eliminates the need for)SH 8796 65836 MT (any computer to support any character sets except its own and the corresponding well-known standards.)SH 7200 67853 MT (2.)SH 8796 XM (The sender should identify the transfer character set to the receiver using a standard notation such as its)SH 8796 69150 MT (ISO registration number. This eliminates the need for setting up separate and redundant registration au-)SH 8796 70447 MT (thorities for character-set identifiers.)SH ES %%Page: 14 14 BS 0 SI 10 /Times-Roman AF 30100 4286 MT (14)SH 10.5 SS 7200 7920 MT (3.)SH 8796 XM (Translations should be invertible, readable, and consistent. When all three of these goals cannot be ach-)SH 8796 9217 MT (ieved by a single translation, the user should be able to choose the translation goal.)SH 7200 11234 MT (These principles can be applied to any form of textual data interchange,) 19 W( including electronic mail, network file)20 W 7200 12531 MT (systems, terminal emulation, virtual terminal service, distributed databases, remote procedure) 9 W( calls, cutting and)8 W 7200 13828 MT (pasting among object-oriented applications, and so on.)SH 8850 15125 MT (The translation process itself, however, remains ill-defined.) 34 W( It) 332 W( is hoped that the industry and the standards)35 W 7200 16422 MT (organizations will take the following steps:)SH 7200 18439 MT (1.)SH 8796 XM (Standard character sets should be used in preference to private character sets.)SH 7200 20456 MT (2.)SH 8796 XM (Owners of private character sets should publish official invertible translations to ISO-registered standard)SH 8796 21753 MT (sets. 
In) 263 W( the absence of official translations, a simple procedure such as the one presented in this paper)SH 8796 23050 MT (should be used to achieve consistent invertible translations across all applications.)SH 7200 25067 MT (3.)SH 8796 XM (Standards organizations are encouraged to consider publishing translations between different standard)SH 8796 26364 MT (character sets, such as the Japanese, Chinese, and Korean sets, as well as readable transliterations among)SH 8796 27661 MT (different alphabetic writing systems, such as Roman, Greek, Hebrew, Arabic, and Cyrillic.)SH 7200 29678 MT (4.)SH 8796 XM (Operating system designers should consider tagging plain-text files with character-set identifiers, like the)SH 8796 30975 MT (Kermit tags listed in Table 1, to allow applications software to determine a file's character set automati-)SH 8796 32272 MT (cally. When) 263 W( standard character sets are used, their tags should be consistent across different operating sys-)SH 8796 33569 MT (tems.)SH 8850 35586 MT (Ten or twenty years from now, perhaps all the computers, as well as all the display, entry, printing, and) 6 W( tel-)5 W 7200 36883 MT (communication devices of the) 21 W( world will use one universal character set, and the issues discussed in this paper)22 W 7200 38180 MT (will be irrelevant.) 126 W( On) 513 W( the other hand, perhaps the accumulated and ever-growing installed base of existing)125 W 7200 39477 MT (hardware, software, and) 9 W( electronic information will prove too massive for conversion and the universal charac-)10 W 7200 40774 MT (ter set will be just one more character set on the list.)SH 15.5 /Times-Bold AF 7200 43378 MT (Acknowledgements)SH 10.5 /Times-Roman AF 7200 44675 MT (My deepest thanks to Christine M. Gianone of) 18 W( Columbia University for inspiring the work and ideas described)17 W 7200 45972 MT (in this paper, and for her) 94 W( key contributions thereto. 
Special thanks also to the others who played prominent)95 W 7200 47269 MT (roles in the) 41 W( design and development of Kermit's character-set translation capabilities: Joe R. Doupnik of Utah)40 W 7200 48566 MT (State University, John Chandler of the Harvard) 100 122 WX(/) 100 MX(Smithsonian Center for) 122 W( Astrophysics, Hirofumi Fujii of the)123 W 7200 49863 MT (Japan National Laboratory of) 62 W( High Energy Physics in Tokyo, John Klensin of the United Nations University,)61 W 7200 51160 MT (Andr)SH /Times-Accent SF (h)SH /Times-Roman SF 10129 XM (Pirard of the University of Li)42 W /Times-Accent SF (k)SH /Times-Roman SF (ge in Belgium, Johan van Wingen of the Netherlands and numerous) 42 W( ISO)43 W 7200 52457 MT (committees, Gisbert W. Selke of the Wissenschaftliches Institut der Ortskrankenkassen in) 4 W( Bonn, Germany, and)3 W 7200 53754 MT (Konstantin Vinogradov of the International Centre) 88 W( for Scientific and Technical Information \050ICSTI\051 in Mos-)89 W 7200 55051 MT (cow. Grateful) 455 W( acknowledgements also to Juri Gornostaev and A. Butrimenko) 96 W( of ICSTI for hosting the First)95 W 7200 56348 MT (International Kermit Conference in Moscow) 73 W( [11]) SH( in Spring 1989, where the ideas in this paper) 73 W( received their)74 W 7200 57645 MT (first public hearing. Thanks also to the many participants in the ISO8859, UNICODE,) 21 W( and ISO10646 network)20 W 7200 58942 MT (discussion groups for valuable information and insights.)SH 15.5 /Times-Bold AF 7200 61546 MT (About the Author)SH 10.5 /Times-Italic AF 7200 63416 MT (FRANK DA CRUZ)99 W /Times-Roman SF 15696 XM (is Manager) 99 W( of Communication Software Development at Columbia University, author of)100 W 7200 64713 MT (the book)297 W /Times-Italic SF 11703 XM (Kermit, A File Transfer) 297 W( Protocol)296 W /Times-Roman SF (, co-author \050with Christine M. 
Gianone\051 of the book)296 W /Times-Italic SF 51492 XM (Using)SH 7200 66010 MT (C-Kermit)SH /Times-Roman SF (, leader of the team that developed the Kermit protocol, and principal author of several) 213 W( Kermit)214 W 7200 67307 MT (software programs including C-Kermit for UNIX, VMS, and OS/2. Present address: Columbia University,) 1 W( 612)SH 7200 68604 MT (West 115th Street, New York, NY 10025, USA; Email: fdc@columbia.edu.)SH ES %%Page: 15 15 BS 0 SI 10 /Times-Roman AF 30100 4286 MT (15)SH 15.5 /Times-Bold AF 7200 8239 MT (References)SH 10.5 SS 7200 10109 MT (1.)SH /Times-Italic SF 8514 XM (ANSI X3.4-1986, Code for Information Interchange.)SH /Times-Roman SF 31297 XM (American National Standards Institute, 1986. The)SH 7200 11259 MT (ASCII specification; the US version of ISO 646.)SH /Times-Bold SF 7200 13127 MT (2.)SH /Times-Italic SF 8514 XM (CCITT Recommendation T.61, Character Repertoire and Coded Character Sets for the International)SH 7200 14277 MT (Teletex Service.)SH /Times-Roman SF 14636 XM (CCITT, Geneva, 1980 \050amended 1984\051.)SH /Times-Bold SF 7200 16147 MT (3.)SH /Times-Roman SF 8514 XM (Chandler, John.)SH /Times-Italic SF 15398 XM (IBM System/370 Kermit User's Guide.)SH /Times-Roman SF 32431 XM (Columbia University Academic Information Sys-)SH 7200 17297 MT (tems, 1993. 
Available in separate versions for VM/CMS, MVS/TSO, and CICS.)SH /Times-Bold SF 7200 19167 MT (4.)SH /Times-Italic SF 8514 XM (Chinese Standard GB 2312-80, Coded Chinese Graphic Character Set for Information Interchange.)SH /Times-Roman SF 51389 XM (China)SH 7200 20317 MT (Association for Standardization, Beijing, 1980.)SH /Times-Bold SF 7200 22187 MT (5.)SH /Times-Roman SF 8514 XM (da Cruz, Frank.)SH /Times-Italic SF 15574 XM (Kermit, A File Transfer Protocol.)SH /Times-Roman SF 30219 XM (Digital Press, Bedford, MA, 1987.)SH /Times-Bold SF 7200 24057 MT (6.)SH /Times-Roman SF 8514 XM (da Cruz, Frank and Christine Gianone.)SH /Times-Italic SF 25286 XM (Using C-Kermit.)SH /Times-Roman SF 32812 XM (Digital Press, Burlington, MA, 1993.)SH 7200 25207 MT (EY-J896E-DP, German edition available Fall 1993.)SH /Times-Bold SF 7200 27077 MT (7.)SH /Times-Roman SF 8514 XM (Data General.)SH /Times-Italic SF 14608 XM (Programming the Display Terminal: Models D217, D413, and D463.)SH /Times-Roman SF 44595 XM (Data General,)SH 7200 28227 MT (Westboro, MA, 1991. 014-002111-00.)SH /Times-Bold SF 7200 30097 MT (8.)SH /Times-Roman SF 8514 XM (Do, James, Ng)SH /Times-Accent SF (r)SH /Times-Roman SF 15485 XM (Thanh Nh)SH /Times-Accent SF (d)SH /Times-Roman SF (n, Ho)SH /Times-Accent SF (d)SH /Times-Roman SF (ng Nguy)SH /Times-Accent SF (i)SH /Times-Roman SF (n. "A) 263 W( proposal for Vietnamese character encoding standards in)SH 7200 31247 MT (a unified text processing framework".)SH /Times-Italic SF 23582 XM (Computer Standards & Interfaces 14)SH /Times-Roman SF 39419 XM (\0501992\051.)SH /Times-Bold SF 7200 33117 MT (9.)SH /Times-Roman SF 8514 XM (Gianone, Christine M.)SH /Times-Italic SF 18432 XM (Using MS-DOS Kermit.)SH /Times-Roman SF 28962 XM (Digital Press, Burlington, MA, 1992. EY-H893E-DP, Also)SH 7200 34267 MT (available in French and German editions.)SH /Times-Bold SF 7200 36137 MT (10.)SH /Times-Roman SF 9039 XM (Gianone, Christine M. 
"Have Kermit, Will Travel".)SH /Times-Italic SF 31452 XM (Kermit News 3)SH /Times-Roman SF (, 1 \050June 1988\051.)SH /Times-Bold SF 7200 38007 MT (11.)SH /Times-Roman SF 9039 XM (Gianone, Christine M. "Mission to Moscow".)SH /Times-Italic SF 28943 XM (Kermit News)SH /Times-Roman SF 34659 XM (, 4 \050June 1990\051.)SH /Times-Bold SF 7200 39877 MT (12.)SH /Times-Roman SF 9039 XM (Gianone, Christine M. and Frank da Cruz. A Locking Shift Mechanism for the Kermit File Transfer)SH 7200 41027 MT (Protocol. Columbia) 263 W( University, 1991.)SH /Times-Bold SF 7200 42897 MT (13.)SH /Times-Italic SF 9039 XM (IBM Character Data Representation Architecture, Level 1 Registry.)SH /Times-Roman SF 38380 XM (IBM Canada Ltd., National Lan-)SH 7200 44047 MT (guage Technical Centre, Ontario, 1990. SC09-1391-00.)SH /Times-Bold SF 7200 45917 MT (14.)SH /Times-Italic SF 9039 XM (ISO Standard 646, 7-Bit Coded Character Set for Information Processing Interchange.)SH /Times-Roman SF 46462 XM (Second edition,)SH 7200 47067 MT (International Organization for Standardization, 1983. Also available as ECMA-6, and similar to CCITT T.50.)SH /Times-Bold SF 7200 49057 MT (15.)SH /Times-Italic SF 9039 XM (ISO International Standard 2022, Information processing)SH /Times-Roman SF 33570 XM (\320)SH /Times-Italic SF 34883 XM (ISO 7-bit and 8-bit coded character sets)SH /Times-Roman SF 52062 XM (\320)SH /Times-Italic SF 7200 50207 MT (Code extension techniques.)SH /Times-Roman SF 19449 XM (Third edition, International Organization for Standardization, 1986. 
Also avail-)SH 7200 51357 MT (able as ECMA-35.)SH /Times-Bold SF 7200 53347 MT (16.)SH /Times-Italic SF 9039 XM (ISO International Standard ISO 2375, Information processing)SH /Times-Roman SF 35466 XM (\320)SH /Times-Italic SF 36779 XM (Procedure for Registration of Escape)SH 7200 54497 MT (Sequences.)SH /Times-Roman SF 12624 XM (International Organization for Standardization, 1985.)SH /Times-Bold SF 7200 56487 MT (17.)SH /Times-Italic SF 9039 XM (ISO International Standard 4873, Information processing)SH /Times-Roman SF 33570 XM (\320)SH /Times-Italic SF 34883 XM (ISO 8-bit code for information interchange)SH /Times-Roman SF 7200 57637 MT (\320)SH /Times-Italic SF 8513 XM (Structure and rules for implementation.)SH /Times-Roman SF 25956 XM (Second edition, International Organization for Standardization,)SH 7200 58787 MT (1986. Also) 263 W( available as ECMA-43.)SH /Times-Bold SF 7200 60777 MT (18.)SH /Times-Italic SF 9039 XM (ISO International Standard 8859 Parts 1 through 9, Information Processing)SH /Times-Roman SF (\320)SH /Times-Italic SF (8-Bit Single-Byte Coded)SH 7200 61927 MT (Graphic Character Sets.)SH /Times-Roman SF 18283 XM (International Organization for Standardization, 1987\261. ISO 8859-1 through -4 are)SH 7200 63077 MT (the Latin Alphabets 1 through 4, also available as ECMA-94. 
ISO 8859-5 is the Latin/Cyrillic Alphabet)SH 7200 64227 MT (\050ECMA 113\051.)SH /Times-Bold SF 7200 66217 MT (19.)SH /Times-Italic SF 9039 XM (ISO/IEC 10646-1, International Standard 10646, Information Technology)SH /Times-Roman SF (\320)SH /Times-Italic SF (Universal Multiple-Octet)SH 7200 67367 MT (Coded Character Set \050UCS\051.)SH /Times-Roman SF 20063 XM (ISO/IEC JTC1, 1993.)SH /Times-Bold SF 7200 69357 MT (20.)SH /Times-Italic SF 9039 XM (ISO International Standard 6429, Information processing)SH /Times-Roman SF 33570 XM (\320)SH /Times-Italic SF 34883 XM (C1 Control Character Set of ISO 6429.)SH /Times-Roman SF 7200 70507 MT (International Organization for Standardization, 1983.)SH ES %%Page: 16 16 BS 0 SI 10 /Times-Roman AF 30100 4286 MT (16)SH 10.5 /Times-Bold AF 7200 7920 MT (21.)SH /Times-Italic SF 9039 XM (ISO International Register of Coded Characters to Be Used with Escape Sequences.)SH /Times-Roman SF 45234 XM (European Computer)SH 7200 9070 MT (Manufacturers Association \050ECMA\051, 1990, updated periodically.)SH /Times-Bold SF 7200 10940 MT (22.)SH /Times-Italic SF 9039 XM (JIS X 0201, The Japanese Katakana and Roman Set of Characters.)SH /Times-Roman SF 38004 XM (Japan Industrial Standards Com-)SH 7200 12090 MT (mittee, 1969.)SH /Times-Bold SF 7200 13960 MT (23.)SH /Times-Italic SF 9039 XM (JIS X 0208, The Japanese Graphic Character Set for Information Interchange.)SH /Times-Roman SF 42960 XM (Japan Industrial Stan-)SH 7200 15110 MT (dards Committee, 1983.)SH /Times-Bold SF 7200 16980 MT (24.)SH /Times-Italic SF 9039 XM (JIS X 0212, Supplementary Japanese Graphic Character Set for Information Interchange.)SH /Times-Roman SF 47683 XM (Japan National)SH 7200 18130 MT (Committee on ISO/IEC JTC1/SC2, 1991.)SH /Times-Bold SF 7200 20000 MT (25.)SH /Times-Italic SF 9039 XM (Korean Standard KS C 5601-1987, Korean Graphic Character Set for Information Interchange.)SH /Times-Roman SF 50282 XM (Korea)SH 7200 21150 MT (Bureau of Standards, 1987.)SH 
/Times-Bold SF 7200 23020 MT (26.)SH /Times-Roman SF 9039 XM (Padlipsky, M. A.)SH /Times-Italic SF 16713 XM (The Elements of Networking Style.)SH /Times-Roman SF 31735 XM (Prentice Hall, 1985.)SH /Times-Bold SF 7200 24890 MT (27.)SH /Times-Roman SF 9039 XM (The Unicode Consortium.)SH /Times-Italic SF 20503 XM (The Unicode Standard, Worldwide Character Encoding, Version 1.0.)SH /Times-Roman SF 7200 26040 MT (Addison-Wesley Publishing Company, Volume 1, 1991; Volume 2, 1992.)SH /Times-Bold SF 7200 27910 MT (28.)SH /Times-Roman SF 9039 XM (Viet Nam General Department for Standardization.)SH /Times-Italic SF 30825 XM (Vietnamese National Standard TCVN 5712, 8-bit)SH 7200 29060 MT (Vietnamese Standard Code for Information Interchange \050VSCII\051.)SH /Times-Roman SF 35318 XM (Viet Nam State Committee for Sciences,)SH 7200 30210 MT (1993.)SH ES %%Trailer %%Pages: 16 %%DocumentFonts: Times-Roman Times-Bold Times-Accent Times-Italic Courier Courier-Accent Symbol CyrillicGothic