dtdocbook: Migrate doc2sdl to UTF-8.

2025-02-15 04:32:24 +00:00 · 2021-05-21 05:06:38 +08:00 · 2021-05-21 05:06:38 +08:00 · 6f952545fb
commit 6f952545fb
parent f2623acbe7
3 changed files with 72 additions and 29 deletions
--- a/cde/programs/dtdocbook/doc2sdl/docbook.sgml
+++ b/cde/programs/dtdocbook/doc2sdl/docbook.sgml
@ -2,23 +2,20 @@

 			       CHARSET

-         BASESET  "ISO 646:1983//CHARSET
-                   International Reference Version (IRV)//ESC 2/5 4/0"
-         DESCSET
-                    0   9   UNUSED
-                    9   2     9
-                   11   2   UNUSED
-                   13   1    13
-                   14  18   UNUSED
-                   32  95    32
-                  127   1   UNUSED
-
-         BASESET  "ISO Registration Number 100//CHARSET
-                   ECMA-94 Right Part of Latin Alphabet Nr. 1//ESC 2/13 4/1"
-         DESCSET  
-                  128  32   UNUSED
-                  160  96   32
-
+         BASESET  "ISO Registration Number 177//CHARSET
+                   ISO/IEC 10646-1:1993 UCS-4 with
+                   implementation level 3//ESC 2/5 2/15 4/6"
+         DESCSET 0     9       UNUSED
+                 9     2       9
+                 11    2       UNUSED
+                 13    1       13
+                 14    18      UNUSED
+                 32    95      32
+                 127   1       UNUSED
+                 128   32      UNUSED
+                 160   55136   160
+                 55296 2048    UNUSED  -- SURROGATES --
+                 57344 1056768 57344

 			   CAPACITY SGMLREF

--- a/cde/programs/dtdocbook/doc2sdl/docbook.tcl
+++ b/cde/programs/dtdocbook/doc2sdl/docbook.tcl
@ -77,6 +77,32 @@ set snbLocation 0
 # EXAMPLE, for instance, we need to default to TYPE="LITERAL"
 set defaultParaType ""

+proc TrimWhiteSpaces {str} {
+    set characters [split $str ""]
+
+    for {set i 0} {$i < 2} {incr i} {
+        set tmp ""
+        set trimmed 0
+
+        foreach c $characters {
+            if {$trimmed == 0} {
+                if {[string is ascii -strict $c]} {
+                    if {[string is space -strict $c]} {
+                        continue
+                    }
+                }
+            }
+
+            set trimmed 1
+
+            lappend tmp $c
+        }
+
+        set characters [lreverse $tmp]
+    }
+
+    return [join $characters ""]
+}

 # print internal error message and exit
 proc InternalError {what} {
@ -255,7 +281,7 @@ proc AddToMarkArray {mark} {
    global validMarkArray

    set m [string range $mark 1 6]
-    set m [string trim $m]
+    set m [TrimWhiteSpaces $m]

    set validMarkArray($m) $mark

@ -1987,13 +2013,13 @@ proc EndAGlossedTerm {id role} {
 	# trim whitespace from the front and back of the string to be
 	# glossed, also turn line feeds into spaces and compress out
 	# duplicate whitespace
-	set glossString [string trim $glossString]
+	set glossString [TrimWhiteSpaces $glossString]
 	set glossString [split $glossString '\n']
 	set tmpGlossString $glossString
 	set glossString [lindex $tmpGlossString 0]
 	foreach str [lrange $tmpGlossString 1 end] {
 	    if {$str != ""} {
-		append glossString " " [string trim $str]
+		append glossString " " [TrimWhiteSpaces $str]
 	    }
 	}

@ -2045,7 +2071,7 @@ proc EndATermInAGlossary {id} {
    set glossString [lindex $tmpGlossString 0]
    foreach str [lrange $tmpGlossString 1 end] {
 	if {$str != ""} {
-	    append glossString " " [string trim $str]
+	    append glossString " " [TrimWhiteSpaces $str]
 	}
    }

@ -2347,7 +2373,7 @@ proc AddIndexEntry {loc} {

    # trim superfluous whitespace at the beginning and end of the
    # indexed term
-    set indexBuffer [string trim $indexBuffer]
+    set indexBuffer [TrimWhiteSpaces $indexBuffer]

    # get an array index and determine whether 1st, 2nd or 3rd level
    set index [join $indexVals ", "]
@ -2411,7 +2437,7 @@ proc EndIndexTerm {} {
 proc StartPrimaryIndexEntry {id cdata} {
    global indexVals

-    set indexVals [list [string trim $cdata]]
+    set indexVals [list [TrimWhiteSpaces $cdata]]
 }


@ -2425,7 +2451,7 @@ proc StartSecondaryIndexEntry {id cdata} {
    global indexVals

    AddIndexEntry "" ;# make sure our primary is defined
-    lappend indexVals [string trim $cdata]
+    lappend indexVals [TrimWhiteSpaces $cdata]
 }


@ -2439,7 +2465,7 @@ proc StartTertiaryIndexEntry {id cdata} {
    global indexVals

    AddIndexEntry "" ;# make sure our secondary is defined
-    lappend indexVals [string trim $cdata]
+    lappend indexVals [TrimWhiteSpaces $cdata]
 }


@ -4692,7 +4718,9 @@ proc IncludeTOSS {} {
 proc GetLocalizedAutoGeneratedStringArray {filename} {
    global localizedAutoGeneratedStringArray

-    set buffer [ReadLocaleStrings $filename]
+    set fp [open $filename]
+    set buffer [read $fp]
+    close $fp

    set regExp {^(".*")[	 ]*(".*")$} ;# look for 2 quoted strings

@ -4701,7 +4729,7 @@ proc GetLocalizedAutoGeneratedStringArray {filename} {
    set index 0
    while {$listLength > 0} {
 	set line [lindex $stringList $index]
-	set line [string trim $line]
+	set line [TrimWhiteSpaces $line]
 	if {([string length $line] > 0) && ([string index $line 0] != "#")} {
 	    if {[regexp $regExp $line match match1 match2]} {
 		set match1 [string trim $match1 \"]
--- a/cde/programs/dtdocbook/doc2sdl/dtdocbook
+++ b/cde/programs/dtdocbook/doc2sdl/dtdocbook
@ -1,5 +1,7 @@
 #!/bin/ksh

+export LC_CTYPE="${LANG}"
+
 # get the name of this command for errors, warnings and messages
 command_name=`basename $0`

@ -86,6 +88,7 @@ do
    esac
 done

+default_charset='UTF-8'
 dbk_lib="${dbk_lib:-/usr/dt/dthelp/dtdocbook}"  # if no -t, use installed dir
 sgml_dir="${sgml_dir:-${dbk_lib}}"              # if no -s, use -t
 info_dir="${info_dir:-/usr/dt/infolib}"         # if no -i, use installed dir
@ -99,12 +102,24 @@ if [[ ${#sgmls} -eq 0 ]] then                  # if no -S, use installed one
 fi
 sgmls="${sgmls:-${info_dir}/etc/sgmls}"         # if no -S, use installed one
 instant="${instant:-${dbk_lib}/instant}"        # if no -I, use installed one
-x_locale="${x_locale:-${dbk_lib}/xlate_locale}" # if no -L, use installed one
+x_locale="${x_locale:-${LANG}}"                 # if no -L, use installed one
 helptag2="${helptag2:-dthelp_htag2}"            # if no -H, use one in PATH

+if [[ "$x_locale" == *.* ]] then
+    x_lang="${x_locale%%.*}"
+    x_charset="${x_locale##*.}"
+
+    if [[ "$x_charset" != "$default_charset" ]] then
+        x_locale="${x_lang}.$default_charset"
+        echo "Warning: charset is changed to ${default_charset}."
+    fi
+else
+    x_locale="${x_locale}.$default_charset"
+fi
+
 # Set the environment variables for instant(1) to find its files
 export TPT_LIB="${dbk_lib}"
-export LOCALE_DIR="${dbk_lib}/$($x_locale)"
+export LOCALE_DIR="${dbk_lib}/${x_locale}"

 # Determine whether we are using sgmls or nsgmls
 parser=`basename $sgmls`
@ -123,6 +138,9 @@ elif ([[ "$SGML_CATALOG_FILES" = "" ]]) then
 	export SGML_CATALOG_FILES="${SGML_CATALOG_FILES}:${sgml_cat}/catalog"
 fi

+export SP_CHARSET_FIXED=1
+export SP_ENCODING="$default_charset"
+
 # Set the environment variable to be picked up inside instant(1) when it
 # goes to call Tcl.
 export DBKTCL_DIR="${dbk_lib}/"