Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
openSUSE:12.2:ARM
html2text
html2text-1.3.2a-611_recognize_input_encoding.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File html2text-1.3.2a-611_recognize_input_encoding.patch of Package html2text
--- html2text-1.3.2a/html2text.C.orig 2009-01-22 17:02:57.000000000 +0100 +++ html2text-1.3.2a/html2text.C 2009-01-22 17:04:07.000000000 +0100 @@ -37,10 +37,16 @@ #include <iostream> #include <fstream> +#include <sstream> +#include <algorithm> +#include <iterator> #include <string.h> #include <stdlib.h> #include <unistd.h> +#include <iconv.h> +#include <errno.h> + #include "html.h" #include "HTMLControl.h" // #include "urlistream.h" @@ -52,11 +58,15 @@ /* ------------------------------------------------------------------------- */ using std::ifstream; +using std::stringstream; +using std::istream_iterator; +using std::ostream_iterator; class MyParser : public HTMLControl { public: enum { PRINT_AS_ASCII, UNPARSE, SYNTAX_CHECK }; + string meta_encoding; MyParser( istream &is_, @@ -108,6 +118,23 @@ MyParser::yyerror(char *p) /*virtual*/ void MyParser::process(const Document &document) { + list<auto_ptr<Meta> >::const_iterator i; + for(i = document.head.metas.begin(); i != document.head.metas.end(); ++i) { + bool exists = false; + get_attribute(i->get()->attributes.get(), "http-equiv", &exists); + if (exists) { + string content = get_attribute(i->get()->attributes.get(), "content", ""); + char to_find[] = "charset="; + string::size_type found_pos = content.find(to_find); + if (found_pos != string::npos) + { + this->meta_encoding = content.substr(found_pos + sizeof(to_find) - 1); + //std::cerr << this->meta_encoding << std::endl; + } + break; + } + } + switch (mode) { case PRINT_AS_ASCII: @@ -128,6 +155,70 @@ MyParser::process(const Document &docume } } +bool recode(stringstream& stream, const char* to_encoding, const char* from_encoding) +{ + iconv_t iconv_handle = iconv_open(to_encoding, from_encoding); + if (iconv_handle != iconv_t(-1)) + { + stream.seekg(0); + string input_string = stream.str(); + size_t input_size = input_string.size(); + char* raw_input = new char[input_size+1]; + char* const orig_raw_input = raw_input; + strcpy(raw_input, input_string.data()); + 
size_t max_output_size = input_size * 4; // maximum possible overhead + char* raw_output = new char[max_output_size+1]; + char* const orig_raw_output = raw_output; + size_t iconv_value = + iconv(iconv_handle, &raw_input, &input_size, &raw_output, &max_output_size); + + if (iconv_value != (size_t)-1) + { + *raw_output = '\0'; + stream.str(string(orig_raw_output)); + /* debug */ + //std::copy(istream_iterator<char>(input_stream), istream_iterator<char>(), ostream_iterator<char>(std::cerr)); + } + + delete [] orig_raw_input; + delete [] orig_raw_output; + iconv_close(iconv_handle); + + if (iconv_value == (size_t)-1) + { + std::cerr << "Input recoding failed due to "; + if (errno == EILSEQ) + { + std::cerr << "invalid input sequence."; + /* debug */ + std::cout << raw_input; + } + else + { + std::cerr << "unknown reason."; + } + std::cerr << std::endl; + return false; + } + } + else + { + if (errno == EINVAL) + { + std::cerr << "Recoding from '" << from_encoding + << "' to '" << to_encoding << "' is not available." << std::endl; + std::cerr << "Check that '" << from_encoding + << "' is a valid encoding." << std::endl; + } + else + { + std::cerr << "Error: cannot setup recoding." 
<< std::endl; + } + return false; + } + return true; +} + /* ------------------------------------------------------------------------- */ static const char *usage = "\ @@ -153,6 +244,7 @@ text.\n\ -nobs Do not use backspaces for boldface and underlining\n\ -ascii Use plain ASCII for output instead of ISO-8859-1\n\ -utf8 Assume both terminal and input stream are in UTF-8 mode\n\ + -nometa Don't try to recode input using 'meta' tag\n\ "; int use_encoding = ISO8859; @@ -190,6 +282,7 @@ main(int argc, char **argv) int width = 79; const char *output_file_name = "-"; bool use_backspaces = false; + bool use_meta = true; int i; for (i = 1; i < argc && argv[i][0] == '-' && argv[i][1]; i++) { @@ -206,6 +299,7 @@ main(int argc, char **argv) if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else + if (!strcmp(arg, "-nometa" )) { use_meta = false; } else { std::cerr << "Unrecognized command line option \"" @@ -362,30 +456,117 @@ main(int argc, char **argv) } istream *isp; - ifstream uis; + istream *uis; + ifstream* infile = NULL; + stringstream input_stream; + + if (strcmp(input_url, "-") == 0) + { + uis = &std::cin; + } + else + { + infile = new ifstream(input_url); + if (!infile->is_open()) + { + delete infile; + std::cerr + << "Cannot open input file \"" + << input_url + << "\"." + << std::endl; + exit(1); + } + uis = infile; + } - uis.open(input_url); - if (!uis.is_open()) { - std::cerr - << "Cannot open input file \"" - << input_url - << "\"." 
- << std::endl; - exit(1); + *uis >> std::noskipws; + std::copy(istream_iterator<char>(*uis), istream_iterator<char>(), ostream_iterator<char>(input_stream)); + + if (infile) + { + infile->close(); + delete infile; + } + + string from_encoding; + if (use_meta) + { + std::ofstream fake_osp("/dev/null"); + // fake parsing to determine meta + MyParser parser( + input_stream, + debug_scanner, + debug_parser, + fake_osp, + mode, + width, + input_url + ); + if (parser.yyparse() != 0) exit(1); + + from_encoding = parser.meta_encoding; + + // don't need to debug twice ... + debug_scanner = false; + debug_parser = false; + + /* + * It will be good to show warning in this case. But there are too many + * html documents without encoding info, so this branch is commented by + * now. + if (parser.meta_encoding.empty()) + { + std::cerr << "Warning: cannot determine encoding from html file." << std::endl; + std::cerr << "To remove this warning, use '-nometa' option with, optionally, '-utf8' or '-ascii' options" << std::endl; + std::cerr << "to process file \"" << input_url << "\"." << std::endl; + } + */ + } + if (from_encoding.empty()) // -nometa supplied or no appropriate tag + { + if (use_encoding == UTF8) + { + from_encoding = "UTF-8"; + } + else if (use_encoding == ASCII) + { + from_encoding = "ASCII"; + } + else + { + from_encoding = "ISO_8859-1"; + } + } + + // recode input + bool result = recode(input_stream, "UTF-8", from_encoding.data()); + if (!result) + { + continue; + } + + if (number_of_input_urls != 1) { + *osp << "###### " << input_url << " ######" << std::endl; } - MyParser parser( - uis, - debug_scanner, - debug_parser, - *osp, - mode, - width, - input_url - ); + // real parsing now always process UTF-8 + use_encoding = UTF8; + // real parsing + input_stream.clear(); + input_stream.seekg(0); + MyParser parser( + input_stream, + debug_scanner, + debug_parser, + *osp, + mode, + width, + input_url + ); if (parser.yyparse() != 0) exit(1); - uis.close(); + } return 0;
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an openSUSE project.
Sign Up
Log In
Places
Places
All Projects
Status Monitor