Recode input according to 'meta http-equiv' in html document.
Index: html2text-1.3.2a/html2text.C
===================================================================
--- html2text-1.3.2a.orig/html2text.C 2008-09-20 14:06:46.787386246 +0300
+++ html2text-1.3.2a/html2text.C 2008-09-20 14:09:32.009308515 +0300
@@ -37,9 +37,15 @@
#include <iostream>
#include <fstream>
+#include <sstream>
+#include <algorithm>
+#include <iterator>
#include <string.h>
#include <stdlib.h>
+#include <iconv.h>
+#include <errno.h>
+
#include "html.h"
#include "HTMLControl.h"
//#include "urlistream.h"
@@ -50,11 +56,15 @@
/* ------------------------------------------------------------------------- */
using std::ifstream;
+using std::stringstream;
+using std::istream_iterator;
+using std::ostream_iterator;
class MyParser : public HTMLControl {
public:
enum { PRINT_AS_ASCII, UNPARSE, SYNTAX_CHECK };
+ string meta_encoding;
MyParser(
istream &is_,
@@ -106,6 +116,23 @@
/*virtual*/ void
MyParser::process(const Document &document)
{
+ list<auto_ptr<Meta> >::const_iterator i;
+ for(i = document.head.metas.begin(); i != document.head.metas.end(); ++i) {
+ bool exists = false;
+ get_attribute(i->get()->attributes.get(), "http-equiv", &exists);
+ if (exists) {
+ string content = get_attribute(i->get()->attributes.get(), "content", "");
+ char to_find[] = "charset=";
+ string::size_type found_pos = content.find(to_find);
+ if (found_pos != string::npos)
+ {
+ this->meta_encoding = content.substr(found_pos + sizeof(to_find) - 1);
+ //std::cerr << this->meta_encoding << std::endl;
+ }
+ break;
+ }
+ }
+
switch (mode) {
case PRINT_AS_ASCII:
@@ -126,6 +153,70 @@
}
}
+bool recode(stringstream& stream, const char* to_encoding, const char* from_encoding)
+{
+ iconv_t iconv_handle = iconv_open(to_encoding, from_encoding);
+ if (iconv_handle != iconv_t(-1))
+ {
+ stream.seekg(0);
+ string input_string = stream.str();
+ size_t input_size = input_string.size();
+ char* raw_input = new char[input_size+1];
+ char* const orig_raw_input = raw_input;
+ strcpy(raw_input, input_string.data());
+ size_t max_output_size = input_size * 4; // maximum possible overhead
+ char* raw_output = new char[max_output_size+1];
+ char* const orig_raw_output = raw_output;
+ size_t iconv_value =
+ iconv(iconv_handle, &raw_input, &input_size, &raw_output, &max_output_size);
+
+ if (iconv_value != (size_t)-1)
+ {
+ *raw_output = '\0';
+ stream.str(string(orig_raw_output));
+ /* debug */
+ //std::copy(istream_iterator<char>(input_stream), istream_iterator<char>(), ostream_iterator<char>(std::cerr));
+ }
+
+ delete [] orig_raw_input;
+ delete [] orig_raw_output;
+ iconv_close(iconv_handle);
+
+ if (iconv_value == (size_t)-1)
+ {
+ std::cerr << "Input recoding failed due to ";
+ if (errno == EILSEQ)
+ {
+ std::cerr << "invalid input sequence.";
+ /* debug */
+ std::cout << raw_input;
+ }
+ else
+ {
+ std::cerr << "unknown reason.";
+ }
+ std::cerr << std::endl;
+ return false;
+ }
+ }
+ else
+ {
+ if (errno == EINVAL)
+ {
+ std::cerr << "Recoding from '" << from_encoding
+ << "' to '" << to_encoding << "' is not available." << std::endl;
+ std::cerr << "Check that '" << from_encoding
+ << "' is a valid encoding." << std::endl;
+ }
+ else
+ {
+ std::cerr << "Error: cannot setup recoding." << std::endl;
+ }
+ return false;
+ }
+ return true;
+}
+
/* ------------------------------------------------------------------------- */
static const char *usage = "\
@@ -151,6 +242,7 @@
-nobs Do not use backspaces for boldface and underlining\n\
-ascii Use plain ASCII for output instead of ISO-8859-1\n\
-utf8 Assume both terminal and input stream are in UTF-8 mode\n\
+ -nometa Don't try to recode input using 'meta' tag\n\
";
int use_encoding = ISO8859;
@@ -188,6 +280,7 @@
int width = 79;
const char *output_file_name = "-";
bool use_backspaces = false;
+ bool use_meta = true;
int i;
for (i = 1; i < argc && argv[i][0] == '-' && argv[i][1]; i++) {
@@ -204,6 +297,7 @@
if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else
if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else
if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else
+ if (!strcmp(arg, "-nometa" )) { use_meta = false; } else
{
std::cerr
<< "Unrecognized command line option \""
@@ -356,30 +450,117 @@
}
istream *isp;
- ifstream uis;
+ istream *uis;
+ ifstream* infile = NULL;
+ stringstream input_stream;
+
+ if (strcmp(input_url, "-") == 0)
+ {
+ uis = &std::cin;
+ }
+ else
+ {
+ infile = new ifstream(input_url);
+ if (!infile->is_open())
+ {
+ delete infile;
+ std::cerr
+ << "Cannot open input file \""
+ << input_url
+ << "\"."
+ << std::endl;
+ exit(1);
+ }
+ uis = infile;
+ }
- uis.open(input_url);
- if (!uis.is_open()) {
- std::cerr
- << "Cannot open input file \""
- << input_url
- << "\"."
- << std::endl;
- exit(1);
+ *uis >> std::noskipws;
+ std::copy(istream_iterator<char>(*uis), istream_iterator<char>(), ostream_iterator<char>(input_stream));
+
+ if (infile)
+ {
+ infile->close();
+ delete infile;
+ }
+
+ string from_encoding;
+ if (use_meta)
+ {
+ std::ofstream fake_osp("/dev/null");
+ // fake parsing to determine meta
+ MyParser parser(
+ input_stream,
+ debug_scanner,
+ debug_parser,
+ fake_osp,
+ mode,
+ width,
+ input_url
+ );
+ if (parser.yyparse() != 0) exit(1);
+
+ from_encoding = parser.meta_encoding;
+
+ // don't need to debug twice ...
+ debug_scanner = false;
+ debug_parser = false;
+
+ /*
+ * It will be good to show warning in this case. But there are too many
+ * html documents without encoding info, so this branch is commented by
+ * now.
+ if (parser.meta_encoding.empty())
+ {
+ std::cerr << "Warning: cannot determine encoding from html file." << std::endl;
+ std::cerr << "To remove this warning, use '-nometa' option with, optionally, '-utf8' or '-ascii' options" << std::endl;
+ std::cerr << "to process file \"" << input_url << "\"." << std::endl;
+ }
+ */
+ }
+ if (from_encoding.empty()) // -nometa supplied or no appropriate tag
+ {
+ if (use_encoding == UTF8)
+ {
+ from_encoding = "UTF-8";
+ }
+ else if (use_encoding == ASCII)
+ {
+ from_encoding = "ASCII";
+ }
+ else
+ {
+ from_encoding = "ISO_8859-1";
+ }
+ }
+
+ // recode input
+ bool result = recode(input_stream, "UTF-8", from_encoding.data());
+ if (!result)
+ {
+ continue;
+ }
+
+ if (number_of_input_urls != 1) {
+ *osp << "###### " << input_url << " ######" << std::endl;
}
- MyParser parser(
- uis,
- debug_scanner,
- debug_parser,
- *osp,
- mode,
- width,
- input_url
- );
+ // real parsing now always process UTF-8
+ use_encoding = UTF8;
+ // real parsing
+ input_stream.clear();
+ input_stream.seekg(0);
+ MyParser parser(
+ input_stream,
+ debug_scanner,
+ debug_parser,
+ *osp,
+ mode,
+ width,
+ input_url
+ );
if (parser.yyparse() != 0) exit(1);
- uis.close();
+
}
return 0;