File html2text-debian-500_utf8_support.patch of Package html2text

Overview Repositories Revisions Requests Users Attributes Meta

File html2text-debian-500_utf8_support.patch of Package html2text

Support UTF-8 encoding when processing input.
Index: html2text-1.3.2a/Area.C
===================================================================
--- html2text-1.3.2a.orig/Area.C	2008-09-20 14:01:44.259190763 +0300
+++ html2text-1.3.2a/Area.C	2008-09-20 14:06:15.255782998 +0300
@@ -36,10 +36,13 @@
 #include <iostream>
 
 #include "Area.h"
+#include "html.h"
 #include "string.h"
 
 #define LATIN1_nbsp 160
 
+extern int use_encoding;
+
 /* ------------------------------------------------------------------------- */
 
 #define malloc_array(type, size)\
@@ -81,6 +84,53 @@
 
 /* ------------------------------------------------------------------------- */
 
+/*           utf_length() and utf_width()       
+ *
+ *     Very simplified algorithm of calculating length of UTF-8
+ *   string. No check for errors. Counting only ASCII bytes and
+ *   leading bytes of UTF-8 multibyte sequences. All bytes like
+ *   10xxxxxx are dropped. If USE_UTF8 is false then returns
+ *   usual length.               --YS
+ */
+
+size_t utf8_aux_count(char ch)
+{
+	if((ch & 0xe0) == 0xc0)
+	{
+		return 1;
+	}
+	else if((ch & 0xf0) == 0xe0)
+	{
+		return 2;
+	}
+	else if ((ch & 0xf8) == 0xf0)
+	{
+		return 3;
+	}
+	else
+	{
+		return 0;
+	}
+}
+
+unsigned int
+Line::utf_length(size_type f, size_type t) const
+{
+	size_type m = (t < length_ ? t : length_);
+	size_type r = m - f;
+	if(USE_UTF8)
+	{
+		for (int i = f; i < m; i++)
+		{
+			char& ch = cells_[i].character;
+			size_type aux_count = utf8_aux_count(ch);
+			r -= aux_count;
+			i += aux_count;
+		}
+	}
+	return r;
+}
+
 void
 Line::resize(size_type l)
 {
@@ -236,6 +286,28 @@
   return *this;
 }
 
+unsigned int
+Area::utf_width()
+{
+  size_type r = width_;
+  if(USE_UTF8) { r = 0;
+    for (size_type yy = 0; yy < height_; yy++) {
+	  int i = width_ - 1;
+      while((i >= 0) && isspace(cells_[yy][i].character))
+	  {
+		  --i;
+	  }
+      size_type aux_count_sum = 0;
+      for (; i >= 0; i--) {
+		aux_count_sum += utf8_aux_count(cells_[yy][i].character);
+      }
+	  size_type r1 = width_ - aux_count_sum;
+      if(r < r1) r = r1;
+    }
+  }
+  return r;
+}
+
 void
 Area::resize(size_type w, size_type h)
 {
@@ -439,7 +511,7 @@
       char c = p->character;
       char a = p->attribute;
 
-      if (c == (char) LATIN1_nbsp) c = ' ';
+      if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' ';
 
       if (a == Cell::NONE) {
         os << c;
Index: html2text-1.3.2a/Area.h
===================================================================
--- html2text-1.3.2a.orig/Area.h	2008-09-20 14:01:44.295185701 +0300
+++ html2text-1.3.2a/Area.h	2008-09-20 14:06:15.255782998 +0300
@@ -81,6 +81,8 @@
   Cell       &operator[](size_type x)       { return cells_[x]; }
   const Cell *cells() const { return cells_; }
 
+  unsigned int utf_length(size_type f, size_type t) const;
+
   void resize(size_type l);
   void enlarge(size_type l) { if (l > length_) resize(l); }
 
@@ -134,6 +136,8 @@
   Cell       *operator[](size_type y)       { return cells_[y]; }
   const Area &operator>>=(size_type rs);
 
+  unsigned int utf_width();
+
   void resize(size_type w, size_type h);
   void enlarge(size_type w, size_type h);
 
Index: html2text-1.3.2a/format.C
===================================================================
--- html2text-1.3.2a.orig/format.C	2008-09-20 14:01:44.311190459 +0300
+++ html2text-1.3.2a/format.C	2008-09-20 14:06:15.259781132 +0300
@@ -1210,6 +1210,7 @@
     }
 
     Line::size_type to = from + 1;
+    int to_from;
 
     Line::size_type lbp = (Line::size_type) -1; // "Last break position".
 
@@ -1238,18 +1239,20 @@
         to++;
       }
 
-      if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; }
+      if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) 
+                    { to = lbp; break; }
     }
 
+    to_from = line.utf_length(from,to);
     /*
      * Copy the "from...to" range from the "line" to the bottom of the "res"
      * Area.
      */
     Area::size_type x = 0;
     Area::size_type len = to - from;
-    if (halign == Area::LEFT || len >= w) { ;                   } else
-    if (halign == Area::CENTER)           { x += (w - len) / 2; } else
-    if (halign == Area::RIGHT)            { x += w - len;       }
+    if (halign == Area::LEFT || to_from >= w) { ;                   } else
+    if (halign == Area::CENTER)           { x += (w - to_from) / 2; } else
+    if (halign == Area::RIGHT)            { x += w - to_from;       }
     res->insert(line.cells() + from, len, x, res->height());
 
     /*
Index: html2text-1.3.2a/html2text.C
===================================================================
--- html2text-1.3.2a.orig/html2text.C	2008-09-20 14:03:39.181744957 +0300
+++ html2text-1.3.2a/html2text.C	2008-09-20 14:06:15.259781132 +0300
@@ -150,9 +150,10 @@
   -o <file>      Redirect output into <file>\n\
   -nobs          Do not use backspaces for boldface and underlining\n\
   -ascii         Use plain ASCII for output instead of ISO-8859-1\n\
+  -utf8          Assume both terminal and input stream are in UTF-8 mode\n\
 ";
 
-int use_iso8859 = 1;
+int use_encoding = ISO8859;
 
 int
 main(int argc, char **argv)
@@ -201,7 +202,8 @@
     if (!strcmp(arg, "-width"        )) { width = atoi(argv[++i]);       } else
     if (!strcmp(arg, "-o"            )) { output_file_name = argv[++i];  } else
     if (!strcmp(arg, "-nobs"         )) { use_backspaces = false;        } else
-    if (!strcmp(arg, "-ascii"        )) { use_iso8859 = false;           } else
+    if (!strcmp(arg, "-ascii"        )) { use_encoding = ASCII;          } else
+    if (!strcmp(arg, "-utf8"         )) { use_encoding = UTF8;           } else
     {
       std::cerr
 	<< "Unrecognized command line option \""
Index: html2text-1.3.2a/html.h
===================================================================
--- html2text-1.3.2a.orig/html.h	2008-09-20 14:01:44.343193129 +0300
+++ html2text-1.3.2a/html.h	2008-09-20 14:06:15.259781132 +0300
@@ -61,6 +61,11 @@
 
 /* ------------------------------------------------------------------------- */
 
+enum {ASCII, ISO8859, UTF8};
+#define USE_ISO8859 (use_encoding == ISO8859)
+#define USE_ASCII (use_encoding == ASCII)
+#define USE_UTF8 (use_encoding == UTF8)
+
 #define LATIN1_nbsp   160
 #define LATIN1_iexcl  161
 #define LATIN1_cent   162
Index: html2text-1.3.2a/sgml.C
===================================================================
--- html2text-1.3.2a.orig/sgml.C	2008-09-20 14:01:44.391192735 +0300
+++ html2text-1.3.2a/sgml.C	2008-09-20 14:06:15.259781132 +0300
@@ -62,261 +62,280 @@
   char name[8];
   int  iso8859code;
   char *asciistr;
+  unsigned long unicode;
 } entities[] = {
-  { "AElig",   LATIN1_AElig,  "AE"         },
-  { "AMP",     0,             "&"          },
-  { "Aacute",  LATIN1_Aacute, "A'"         },
-  { "Acirc",   LATIN1_Acirc,  "A^"         },
-  { "Agrave",  LATIN1_Agrave, "A`"         },
-  { "Alpha",   0,             "A"          },
-  { "Aring",   LATIN1_Aring,  "AA"         },
-  { "Atilde",  LATIN1_Atilde, "A~"         },
-  { "Auml",    LATIN1_Auml,   "A\""        },
-  { "Beta",    0,             "B"          },
-  { "Ccedil",  LATIN1_Ccedil, "C,"         },
-  { "Chi",     0,             "H"          },
-  { "Dagger",  0,             "++"         },
-  { "Delta",   0,             "D"          },
-  { "ETH",     LATIN1_ETH,    "D-"         },
-  { "Eacute",  LATIN1_Eacute, "E'"         },
-  { "Ecirc",   LATIN1_Ecirc,  "E^"         },
-  { "Egrave",  LATIN1_Egrave, "E`"         },
-  { "Epsilon", 0,             "E"          },
-  { "Eta",     0,             "E"          },
-  { "Euml",    LATIN1_Euml,   "E\""        },
-  { "GT",      0,             ">"          },
-  { "Gamma",   0,             "G"          },
-  { "Iacute",  LATIN1_Iacute, "I'"         },
-  { "Icirc",   LATIN1_Icirc,  "I^"         },
-  { "Igrave",  LATIN1_Igrave, "I`"         },
-  { "Iota",    0,             "I"          },
-  { "Iuml",    LATIN1_Iuml,   "I\""        },
-  { "Kappa",   0,             "K"          },
-  { "LT",      0,             "<"          },
-  { "Lambda",  0,             "L"          },
-  { "Mu",      0,             "M"          },
-  { "Ntilde",  LATIN1_Ntilde, "N~"         },
-  { "Nu",      0,             "N"          },
-  { "OElig",   0,             "OE"         },
-  { "Oacute",  LATIN1_Oacute, "O'"         },
-  { "Ocirc",   LATIN1_Ocirc,  "O^"         },
-  { "Ograve",  LATIN1_Ograve, "O`"         },
-  { "Omega",   0,             "O"          },
-  { "Omicron", 0,             "O"          },
-  { "Oslash",  LATIN1_Oslash, "O/"         },
-  { "Otilde",  LATIN1_Otilde, "O~"         },
-  { "Ouml",    LATIN1_Ouml,   "O\""        },
-  { "Phi",     0,             "F"          },
-  { "Pi",      0,             "P"          },
-  { "Prime",   0,             "''"         },
-  { "Psi",     0,             "PS"         },
-  { "QUOT",    0,             "\""         },
-  { "Rho",     0,             "R"          },
-  { "Scaron",  0,             "S"          },
-  { "Sigma",   0,             "S"          },
-  { "THORN",   LATIN1_THORN,  "TH"         },
-  { "Tau",     0,             "T"          },
-  { "Theta",   0,             "TH"         },
-  { "Uacute",  LATIN1_Uacute, "U'"         },
-  { "Ucirc",   LATIN1_Ucirc,  "U^"         },
-  { "Ugrave",  LATIN1_Ugrave, "U`"         },
-  { "Upsilon", 0,             "U"          },
-  { "Uuml",    LATIN1_Uuml,   "U\""        },
-  { "Xi",      0,             "X"          },
-  { "Yacute",  LATIN1_Yacute, "Y'"         },
-  { "Yuml",    0,             "Y\""        },
-  { "Zeta",    0,             "Z"          },
-  { "aacute",  LATIN1_aacute, "a'"         },
-  { "acirc",   LATIN1_acirc,  "a^"         },
-  { "acute",   LATIN1_acute,  "'"          },
-  { "aelig",   LATIN1_aelig,  "ae"         },
-  { "agrave",  LATIN1_agrave, "a`"         },
+  { "AElig",   LATIN1_AElig,  "AE",  0x00c6},
+  { "AMP",     0,             "&",   0x0026},
+  { "Aacute",  LATIN1_Aacute, "A'",  0x00c1},
+  { "Acirc",   LATIN1_Acirc,  "A^",  0x00c2},
+  { "Agrave",  LATIN1_Agrave, "A`",  0x00c0},
+  { "Alpha",   0,             "A",   0x0391},
+  { "Aring",   LATIN1_Aring,  "AA",  0x00c5},
+  { "Atilde",  LATIN1_Atilde, "A~",  0x00c3},
+  { "Auml",    LATIN1_Auml,   "A\"", 0x00c4},
+  { "Beta",    0,             "B",   0x0392},
+  { "Ccedil",  LATIN1_Ccedil, "C,",  0x00c7},
+  { "Chi",     0,             "H",   0x03a7},
+  { "Dagger",  0,             "++",  0x2020},
+  { "Delta",   0,             "D",   0x0394},
+  { "ETH",     LATIN1_ETH,    "D-",  0x00d0},
+  { "Eacute",  LATIN1_Eacute, "E'",  0x00c9},
+  { "Ecirc",   LATIN1_Ecirc,  "E^",  0x00ca},
+  { "Egrave",  LATIN1_Egrave, "E`",  0x00c8},
+  { "Epsilon", 0,             "E",   0x0395},
+  { "Eta",     0,             "E",   0x0397},
+  { "Euml",    LATIN1_Euml,   "E\"", 0x00cb},
+  { "GT",      0,             ">",   0x003e},
+  { "Gamma",   0,             "G",   0x0393},
+  { "Iacute",  LATIN1_Iacute, "I'",  0x00cd},
+  { "Icirc",   LATIN1_Icirc,  "I^",  0x00ce},
+  { "Igrave",  LATIN1_Igrave, "I`",  0x00cc},
+  { "Iota",    0,             "I",   0x0399},
+  { "Iuml",    LATIN1_Iuml,   "I\"", 0x00cf},
+  { "Kappa",   0,             "K",   0x039a},
+  { "LT",      0,             "<",   0x003c},
+  { "Lambda",  0,             "L",   0x039b},
+  { "Mu",      0,             "M",   0x039c},
+  { "Ntilde",  LATIN1_Ntilde, "N~",  0x00d1},
+  { "Nu",      0,             "N",   0x039d},
+  { "OElig",   0,             "OE",  0x0152},
+  { "Oacute",  LATIN1_Oacute, "O'",  0x00d3},
+  { "Ocirc",   LATIN1_Ocirc,  "O^",  0x00d4},
+  { "Ograve",  LATIN1_Ograve, "O`",  0x00d2},
+  { "Omega",   0,             "O",   0x03a9},
+  { "Omicron", 0,             "O",   0x039f},
+  { "Oslash",  LATIN1_Oslash, "O/",  0x00d8},
+  { "Otilde",  LATIN1_Otilde, "O~",  0x00d5},
+  { "Ouml",    LATIN1_Ouml,   "O\"", 0x00d6},
+  { "Phi",     0,             "F",   0x03a6},
+  { "Pi",      0,             "P",   0x03a0},
+  { "Prime",   0,             "''",        },
+  { "Psi",     0,             "PS",  0x03a8},
+  { "QUOT",    0,             "\"",        },
+  { "Rho",     0,             "R",   0x03a1},
+  { "Scaron",  0,             "S",   0x0161},
+  { "Sigma",   0,             "S",   0x03a3},
+  { "THORN",   LATIN1_THORN,  "TH",  0x00de},
+  { "Tau",     0,             "T",   0x03a4},
+  { "Theta",   0,             "TH",  0x0398},
+  { "Uacute",  LATIN1_Uacute, "U'",  0x00da},
+  { "Ucirc",   LATIN1_Ucirc,  "U^",  0x00db},
+  { "Ugrave",  LATIN1_Ugrave, "U`",  0x00d9},
+  { "Upsilon", 0,             "U",   0x03a5},
+  { "Uuml",    LATIN1_Uuml,   "U\"", 0x00dc},
+  { "Xi",      0,             "X",   0x039e},
+  { "Yacute",  LATIN1_Yacute, "Y'",  0x00dd},
+  { "Yuml",    0,             "Y\"", 0x0178},
+  { "Zeta",    0,             "Z",   0x0396},
+  { "aacute",  LATIN1_aacute, "a'",  0x00e1},
+  { "acirc",   LATIN1_acirc,  "a^",  0x00e2},
+  { "acute",   LATIN1_acute,  "'",   0x00b4},
+  { "aelig",   LATIN1_aelig,  "ae",  0x00e6},
+  { "agrave",  LATIN1_agrave, "a`",  0x00e0},
   { "alefsym", 0,             "Aleph"      },
-  { "alpha",   0,             "a"          },
+  { "alpha",   0,             "a",   0x03b1},
   { "amp",     0,             "&"          },
   { "and",     0,             "AND"        },
   { "ang",     0,             "-V"         },
   { "apos",    0,             "'"          },
-  { "aring",   LATIN1_aring,  "aa"         },
-  { "asymp",   0,             "~="         },
-  { "atilde",  LATIN1_atilde, "a~"         },
-  { "auml",    LATIN1_auml,   "a\""        },
+  { "aring",   LATIN1_aring,  "aa",  0x00e5},
+  { "asymp",   0,             "~=",  0x2248},
+  { "atilde",  LATIN1_atilde, "a~",  0x00e3},
+  { "auml",    LATIN1_auml,   "a\"", 0x00e5},
   { "bdquo",   0,             "\""         },
-  { "beta",    0,             "b"          },
-  { "brvbar",  LATIN1_brvbar, "|"          },
-  { "bull",    0,             " o "        },
+  { "beta",    0,             "b",   0x03b2},
+  { "brvbar",  LATIN1_brvbar, "|",   0x00a6},
+  { "bull",    0,             " o ", 0x2022},
   { "cap",     0,             "(U"         },
-  { "ccedil",  LATIN1_ccedil, "c,"         },
-  { "cedil",   LATIN1_cedil,  ","          },
-  { "cent",    LATIN1_cent,   "-c-"        },
-  { "chi",     0,             "h"          },
-  { "circ",    0,             "^"          },
+  { "ccedil",  LATIN1_ccedil, "c,",  0x00e7},
+  { "cedil",   LATIN1_cedil,  ",",   0x00b8},
+  { "cent",    LATIN1_cent,   "-c-", 0x00a2},
+  { "chi",     0,             "h",   0x03c7},
+  { "circ",    0,             "^",   0x005e},
 //  { "clubs",   0,             "[clubs]"    },
   { "cong",    0,             "?="         },
-  { "copy",    LATIN1_copy,   "(c)"        },
+  { "copy",    LATIN1_copy,   "(c)", 0x00a9},
   { "crarr",   0,             "<-'"        },
   { "cup",     0,             ")U"         },
-  { "curren",  LATIN1_curren, "CUR"        },
+  { "curren",  LATIN1_curren, "CUR", 0x00a4},
   { "dArr",    0,             "vv"         },
-  { "dagger",  0,             "+"          },
+  { "dagger",  0,             "+",   0x2020},
   { "darr",    0,             "v"          },
-  { "deg",     LATIN1_deg,    "DEG"        },
-  { "delta",   0,             "d"          },
+  { "deg",     LATIN1_deg,    "DEG", 0x00b0},
+  { "delta",   0,             "d",   0x03b4},
 //  { "diams",   0,             "[diamonds]" },
-  { "divide",  LATIN1_divide, "/"          },
-  { "eacute",  LATIN1_eacute, "e'"         },
-  { "ecirc",   LATIN1_ecirc,  "e^"         },
-  { "egrave",  LATIN1_egrave, "e`"         },
+  { "divide",  LATIN1_divide, "/",   0x00f7},
+  { "eacute",  LATIN1_eacute, "e'",  0x00e9},
+  { "ecirc",   LATIN1_ecirc,  "e^",  0x00ea},
+  { "egrave",  LATIN1_egrave, "e`",  0x00e8},
   { "empty",   0,             "{}"         },
-  { "epsilon", 0,             "e"          },
-  { "equiv",   0,             "=="         },
-  { "eta",     0,             "e"          },
-  { "eth",     LATIN1_eth,    "d-"         },
-  { "euml",    LATIN1_euml,   "e\""        },
-  { "euro",    0,             "EUR"        },
+  { "epsilon", 0,             "e",   0x03b5},
+  { "equiv",   0,             "==",  0x2261},
+  { "eta",     0,             "e",   0x03b7},
+  { "eth",     LATIN1_eth,    "d-",  0x00f0},
+  { "euml",    LATIN1_euml,   "e\"", 0x00eb},
+  { "euro",    0,             "EUR", 0x20ac},
   { "exist",   0,             "TE"         },
   { "fnof",    0,             "f"          },
   { "forall",  0,             "FA"         },
-  { "frac12",  LATIN1_frac12, " 1/2"       },
-  { "frac14",  LATIN1_frac14, " 1/4"       },
-  { "frac34",  LATIN1_frac34, " 3/4"       },
+  { "frac12",  LATIN1_frac12, " 1/2",0x00bd},
+  { "frac14",  LATIN1_frac14, " 1/4",0x00bc},
+  { "frac34",  LATIN1_frac34, " 3/4",0x00be},
   { "frasl",   0,             "/"          },
-  { "gamma",   0,             "g"          },
-  { "ge",      0,             ">="         },
-  { "gt",      0,             ">"          },
+  { "gamma",   0,             "g",   0x03b3},
+  { "ge",      0,             ">=",  0x2265},
+  { "gt",      0,             ">",   0x003e},
   { "hArr",    0,             "<=>"        },
   { "harr",    0,             "<->"        },
 //  { "hearts",  0,             "[hearts]"   },
-  { "hellip",  0,             "..."        },
-  { "iacute",  LATIN1_iacute, "i'"         },
-  { "icirc",   LATIN1_icirc,  "i^"         },
-  { "iexcl",   LATIN1_iexcl,  "!"          },
-  { "igrave",  LATIN1_igrave, "i`"         },
+  { "hellip",  0,             "...", 0x2026},
+  { "iacute",  LATIN1_iacute, "i'",  0x00ed},
+  { "icirc",   LATIN1_icirc,  "i^",  0x00ee},
+  { "iexcl",   LATIN1_iexcl,  "!",   0x00a1},
+  { "igrave",  LATIN1_igrave, "i`",  0x00ec},
   { "image",   0,             "Im"         },
-  { "infin",   0,             "oo"         },
-  { "int",     0,             "INT"        },
-  { "iota",    0,             "i"          },
-  { "iquest",  LATIN1_iquest, "?"          },
+  { "infin",   0,             "oo",  0x221e},
+  { "int",     0,             "INT", 0x222b},
+  { "iota",    0,             "i",   0x03b9},
+  { "iquest",  LATIN1_iquest, "?",   0x00bf},
   { "isin",    0,             "(-"         },
-  { "iuml",    LATIN1_iuml,   "i\""        },
-  { "kappa",   0,             "k"          },
+  { "iuml",    LATIN1_iuml,   "i\"", 0x00ef},
+  { "kappa",   0,             "k",   0x03ba},
   { "lArr",    0,             "<="         },
-  { "lambda",  0,             "l"          },
+  { "lambda",  0,             "l",   0x03bb},
   { "lang",    0,             "</"         },
   { "laquo",   LATIN1_laquo,  "<<"         },
-  { "larr",    0,             "<-"         },
+  { "larr",    0,             "<-",  0x2190},
 //  { "lceil",   0,             "<|"         },
   { "ldquo",   0,             "\""         },
-  { "le",      0,             "<="         },
+  { "le",      0,             "<=",  0x2264},
 //  { "lfloor",  0,             "|<"         },
   { "lowast",  0,             "*"          },
   { "loz",     0,             "<>"         },
   { "lsaquo",  0,             "<"          },
   { "lsquo",   0,             "`"          },
-  { "lt",      0,             "<"          },
-  { "macr",    LATIN1_macr,   "-"          },
+  { "lt",      0,             "<",   0x003c},
+  { "macr",    LATIN1_macr,   "-",   0x00af},
   { "mdash",   0,             "--"         },
-  { "micro",   LATIN1_micro,  "my"         },
-  { "middot",  LATIN1_middot, "."          },
-  { "minus",   0,             "-"          },
-  { "mu",      0,             "m"          },
+  { "micro",   LATIN1_micro,  "my",  0x00b5},
+  { "middot",  LATIN1_middot, ".",   0x00b7},
+  { "minus",   0,             "-",   0x2212},
+  { "mu",      0,             "m",   0x03bc},
   { "nabla",   0,             "Nabla"      },
-  { "nbsp",    LATIN1_nbsp,   " "          },
+  { "nbsp",    LATIN1_nbsp,   " ",   0x00a0},
   { "ndash",   0,             "-"          },
-  { "ne",      0,             "!="         },
+  { "ne",      0,             "!=",  0x2260},
   { "ni",      0,             "-)"         },
   { "not",     LATIN1_not,    "NOT"        },
   { "notin",   0,             "!(-"        },
   { "nsub",    0,             "!(C"        },
-  { "ntilde",  LATIN1_ntilde, "n~"         },
-  { "nu",      0,             "n"          },
-  { "oacute",  LATIN1_oacute, "o'"         },
-  { "ocirc",   LATIN1_ocirc,  "o^"         },
+  { "ntilde",  LATIN1_ntilde, "n~",  0x00f1},
+  { "nu",      0,             "n",   0x03bd},
+  { "oacute",  LATIN1_oacute, "o'",  0x00f3},
+  { "ocirc",   LATIN1_ocirc,  "o^",  0x00f4},
   { "oelig",   0,             "oe"         },
-  { "ograve",  LATIN1_ograve, "o`"         },
+  { "ograve",  LATIN1_ograve, "o`",  0x00f2},
   { "oline",   LATIN1_macr,   "-"          },
-  { "omega",   0,             "o"          },
-  { "omicron", 0,             "o"          },
+  { "omega",   0,             "o",   0x03c9},
+  { "omicron", 0,             "o",   0x03bf},
   { "oplus",   0,             "(+)"        },
   { "or",      0,             "OR"         },
-  { "ordf",    LATIN1_ordf,   "-a"         },
-  { "ordm",    LATIN1_ordm,   "-o"         },
-  { "oslash",  LATIN1_oslash, "o/"         },
-  { "otilde",  LATIN1_otilde, "o~"         },
+  { "ordf",    LATIN1_ordf,   "-a",  0x00aa},
+  { "ordm",    LATIN1_ordm,   "-o",  0x00ba},
+  { "oslash",  LATIN1_oslash, "o/",  0x00f8},
+  { "otilde",  LATIN1_otilde, "o~",  0x00f5},
   { "otimes",  0,             "(x)"        },
-  { "ouml",    LATIN1_ouml,   "o\""        },
-  { "para",    LATIN1_para,   "P:"         },
-  { "part",    0,             "PART"       },
-  { "permil",  0,             " 0/00"      },
+  { "ouml",    LATIN1_ouml,   "o\"", 0x00f6},
+  { "para",    LATIN1_para,   "P:",  0x00b6},
+  { "part",    0,             "PART",0x2202},
+  { "permil",  0,             " 0/00",0x2030},
   { "perp",    0,             "-T"         },
-  { "phi",     0,             "f"          },
-  { "pi",      0,             "p"          },
+  { "phi",     0,             "f",   0x03c6},
+  { "pi",      0,             "p",   0x03c0},
   { "piv",     0,             "Pi"         },
-  { "plusmn",  LATIN1_plusmn, "+/-"        },
-  { "pound",   LATIN1_pound,  "-L-"        },
+  { "plusmn",  LATIN1_plusmn, "+/-", 0x00b1},
+  { "pound",   LATIN1_pound,  "-L-", 0x00a3},
   { "prime",   0,             "'"          },
-  { "prod",    0,             "PROD"       },
+  { "prod",    0,             "PROD",0x220f},
   { "prop",    0,             "0("         },
-  { "psi",     0,             "ps"         },
+  { "psi",     0,             "ps",  0x03c8},
   { "quot",    0,             "\""         },
   { "rArr",    0,             "=>"         },
-  { "radic",   0,             "SQRT"       },
+  { "radic",   0,             "SQRT",0x221a},
   { "rang",    0,             "/>"         },
   { "raquo",   LATIN1_raquo,  ">>"         },
-  { "rarr",    0,             "->"         },
+  { "rarr",    0,             "->",  0x2192},
 //  { "rceil",   0,             ">|"         },
   { "rdquo",   0,             "\""         },
   { "real",    0,             "Re"         },
-  { "reg",     LATIN1_reg,    "(R)"        },
+  { "reg",     LATIN1_reg,    "(R)", 0x00ae},
 //  { "rfloor",  0,             "|>"         },
-  { "rho",     0,             "r"          },
+  { "rho",     0,             "r",   0x03c1},
   { "rsaquo",  0,             ">"          },
   { "rsquo",   0,             "'"          },
   { "sbquo",   0,             "'"          },
-  { "scaron",  0,             "s"          },
+  { "scaron",  0,             "s",   0x0161},
   { "sdot",    0,             "DOT"        },
-  { "sect",    LATIN1_sect,   "S:"         },
+  { "sect",    LATIN1_sect,   "S:",  0x00a7},
   { "shy",     LATIN1_shy,    ""           },
-  { "sigma",   0,             "s"          },
-  { "sigmaf",  0,             "s"          },
+  { "sigma",   0,             "s",   0x03c3},
+  { "sigmaf",  0,             "s",   0x03c2},
   { "sim",     0,             "~"          },
 //  { "spades",  0,             "[spades]"   },
   { "sub",     0,             "(C"         },
   { "sube",    0,             "(_"         },
-  { "sum",     0,             "SUM"        },
+  { "sum",     0,             "SUM", 0x2211},
   { "sup",     0,             ")C"         },
-  { "sup1",    LATIN1_sup1,   "^1"         },
-  { "sup2",    LATIN1_sup2,   "^2"         },
-  { "sup3",    LATIN1_sup3,   "^3"         },
+  { "sup1",    LATIN1_sup1,   "^1",  0x00b9},
+  { "sup2",    LATIN1_sup2,   "^2",  0x00b2},
+  { "sup3",    LATIN1_sup3,   "^3",  0x00b3},
   { "supe",    0,             ")_"         },
-  { "szlig",   LATIN1_szlig,  "ss"         },
-  { "tau",     0,             "t"          },
+  { "szlig",   LATIN1_szlig,  "ss",  0x00df},
+  { "tau",     0,             "t",   0x03c4},
   { "there4",  0,             ".:"         },
-  { "theta",   0,             "th"         },
-  { "thorn",   LATIN1_thorn,  "th"         },
-  { "tilde",   0,             "~"          },
-  { "times",   LATIN1_times,  "x"          },
-  { "trade",   0,             "[TM]"       },
+  { "theta",   0,             "th",  0x03b8},
+  { "thorn",   LATIN1_thorn,  "th",  0x00fe},
+  { "tilde",   0,             "~",   0x02dc},
+  { "times",   LATIN1_times,  "x",   0x00d7},
+  { "trade",   0,             "[TM]",0x2122},
   { "uArr",    0,             "^^"         },
-  { "uacute",  LATIN1_uacute, "u'"         },
+  { "uacute",  LATIN1_uacute, "u'",  0x00fa},
   { "uarr",    0,             "^"          },
-  { "ucirc",   LATIN1_ucirc,  "u^"         },
-  { "ugrave",  LATIN1_ugrave, "u`"         },
-  { "uml",     LATIN1_uml,    "\""         },
-  { "upsilon", 0,             "u"          },
-  { "uuml",    LATIN1_uuml,   "u\""        },
+  { "ucirc",   LATIN1_ucirc,  "u^",  0x00fb},
+  { "ugrave",  LATIN1_ugrave, "u`",  0x00f9},
+  { "uml",     LATIN1_uml,    "\"",  0x00a8},
+  { "upsilon", 0,             "u",   0x03c5},
+  { "uuml",    LATIN1_uuml,   "u\"", 0x00fc},
   { "weierp",  0,             "P"          },
-  { "xi",      0,             "x"          },
-  { "yacute",  LATIN1_yacute, "y'"         },
-  { "yen",     LATIN1_yen,    "YEN"        },
-  { "yuml",    LATIN1_yuml,   "y\""        },
-  { "zeta",    0,             "z"          },
+  { "xi",      0,             "x",   0x03be},
+  { "yacute",  LATIN1_yacute, "y'",  0x00fd},
+  { "yen",     LATIN1_yen,    "YEN", 0x00a5},
+  { "yuml",    LATIN1_yuml,   "y\"", 0x00ff},
+  { "zeta",    0,             "z",   0x03b6},
 };
 
-extern int use_iso8859;
+extern int use_encoding;
 
 /* ------------------------------------------------------------------------- */
 
+char ubuf[4];
+
+char *mkutf(unsigned long x)
+{
+  memset(ubuf, 0, 4);
+  if(x < 128) ubuf[0] = x;
+  else if(x < 0x800) {
+     ubuf[0] = (0xc0 | ((x >> 6) & 0x1f));
+     ubuf[1] = (0x80 | (x & 0x3f));
+  }
+  else {
+     ubuf[0] = (0xe0 | ((x >> 12) & 0x0f));
+     ubuf[1] = (0x80 | ((x >> 6) & 0x3f));
+     ubuf[2] = (0x80 | (x & 0x3f));
+  }
+  return ubuf;
+}
+
 void
 replace_sgml_entities(string *s)
 {
@@ -330,9 +349,9 @@
      */
     while (j < l && s->at(j) != '&') ++j;
     /*
-     * We could convert high-bit chars to "&#233;" here if use_iso8859
-     * is off, then let them be translated or not.  Is the purpose of
-     * !use_iso8859 to allow SGML entities to be seen, or to strongly
+     * We could convert high-bit chars to "&#233;" here if USE_ASCII
+     * is on, then let them be translated or not.  Is the purpose of
+     * USE_ASCII to allow SGML entities to be seen, or to strongly
      * filter against high-ASCII chars that might blow up a terminal
      * that doesn't speak ISO8859?  For the moment, "allow SGML entities
      * to be seen" -- no filtering here.
@@ -370,7 +389,11 @@
           if (!isdigit(c)) break;
           x = 10 * x + c - '0';
         }
-        if (use_iso8859 || (x < 128)) {
+        if (USE_UTF8) {
+          s->replace(beg, j - beg, mkutf(x));
+          j = beg + 1;
+        }
+        else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) {
         s->replace(beg, j - beg, 1, (char) x);
         j = beg + 1;
         } else {
@@ -408,13 +431,17 @@
         (int (*)(const void *, const void *)) strcmp
       );
       if (entity != NULL) {
-        if (use_iso8859 && entity->iso8859code) {
+        if (USE_ISO8859 && entity->iso8859code) {
           s->replace(beg, j - beg, 1, (char) entity->iso8859code);
           j = beg + 1;
-        } else if (entity->asciistr) {
+        } else if (USE_ASCII && entity->asciistr) {
           s->replace(beg, j - beg, entity->asciistr);
         j = beg + 1;
         } /* else don't replace it at all, we don't have a translation */
+        else if(USE_UTF8 && entity->unicode) {
+        s->replace(beg, j - beg, mkutf(entity->unicode));
+        j = beg + 1;
+        }
       }
     } else {
       ;                         /* EXTENSION: Allow literal '&' sometimes. */
Index: html2text-1.3.2a/table.C
===================================================================
--- html2text-1.3.2a.orig/table.C	2008-09-20 14:01:44.415186916 +0300
+++ html2text-1.3.2a/table.C	2008-09-20 14:06:15.259781132 +0300
@@ -175,7 +175,7 @@
           - (*number_of_columns_return - 1) * (column_spacing + 0),
           Area::LEFT // Yields better results than "p->halign"!
         ));
-	p->width = tmp.get() ? tmp->width() : 0;
+	p->width = tmp.get() ? tmp->utf_width() : 0;
       }
       p->minimized = false;
 
@@ -308,7 +308,7 @@
 	left_of_column + old_column_width - 1,
 	Area::LEFT // Yields better results than "lc.halign"!
       ));
-      w = tmp->width();
+      w = tmp->utf_width();
       if (w >= left_of_column + old_column_width) lc.minimized = true;
     }
     if (w > left_of_column + new_column_width) {

Places

File html2text-debian-500_utf8_support.patch of Package html2text

Places