/*****************************************************************************/ /* HTML.c Processing associated with reading and composing HTML-format messages. COPYRIGHT --------- Copyright (C) 2005-2025 Mark G.Daniel This program, comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it under the conditions of the GNU GENERAL PUBLIC LICENSE, version 3, or any later version. VERSION HISTORY --------------- 01-JUN-2025 MGD HTML manipulation now performed using JavaScript 25-FEB-2010 MGD HtmlToPlain() supply charset for UTF-8 deentify HtmlCharacterDeEntify() allow for UTF-8 charset 04-JUL-2006 MGD bugfix; HtmlSkipTagTo() quotes outside of tags 20-JUN-2006 MGD HtmlInDisguise() is it really not plain but HTML text? 14-MAR-2006 MGD refine HtmlNumericDeEntify() and HtmlCharacterDeEntify() 13-MAR-2006 MGD HtmlSanitise() suppress "background-image" 01-FEB-2005 MGD initial */ /*****************************************************************************/ #ifdef SOYMAIL_VMS_V7 #undef _VMS_V6_SOURCE #define _VMS_V6_SOURCE #undef __VMS_VER #define __VMS_VER 70000000 #undef __CRTL_VER #define __CRTL_VER 70000000 #endif #pragma nomember_alignment /* standard C header files */ #include #include #include #include #include #include #include #include /* VMS related header files */ #include #include #include /* application header file */ #include "soymail.h" #include "cgilib.h" #include "config.h" #include "html.h" #include "message.h" #define FI_LI "HTML", __LINE__ /******************/ /* global storage */ /******************/ /********************/ /* external storage */ /********************/ extern BOOL Debug, WatchEnabled; extern char SoyMailQueryVersion[]; extern CONFIG_DATA SoyMailConfig; /*****************************************************************************/ /* The content is already plain. Just copy it in. 
*/

void HtmlIsPlain (char *PlainPtr)

{
   /*********/
   /* begin */
   /*********/

   if (WatchEnabled) WatchThis (FI_LI, "HtmlIsPlain()");

   /* mode 1 is the "plain => plain" case of HtmlByJavaScript() */
   HtmlByJavaScript (PlainPtr, 1);
}

/*****************************************************************************/
/*
Reduce HTML markup down to a semblance of plain-text.  Relies on the underlying
HTML to provide white-space, indenting, etc.  Essentially just strips the
markup out and adds carriage-control as necessary.  Will format ordered and
unordered lists.

This function itself only reports to the watch facility (when enabled) and
hands 'HtmlPtr' to HtmlByJavaScript() with mode 2.
NOTE(review): the description above may predate the move to JavaScript-based
processing (see the 01-JUN-2025 version history entry) - confirm it still
describes what mode 2 produces.
*/

void HtmlToPlain (char *HtmlPtr)

{
   /*********/
   /* begin */
   /*********/

   if (WatchEnabled) WatchThis (FI_LI, "HtmlToPlain()");

   HtmlByJavaScript (HtmlPtr, 2);
}

/*****************************************************************************/
/*
Report to the watch facility (when enabled) and hand 'HtmlPtr' to
HtmlByJavaScript() with mode 3.
NOTE(review): presumably mode 3 selects sanitising of the HTML (going by this
function's name); the mode 3 handling is not visible here - confirm.
*/

void HtmlSanitise (char *HtmlPtr)

{
   /*********/
   /* begin */
   /*********/

   if (WatchEnabled) WatchThis (FI_LI, "HtmlSanitise()");

   HtmlByJavaScript (HtmlPtr, 3);
}

/*****************************************************************************/
/*
MIXES of fprintf() and fwrite() (the more efficient) seem fraught with
unexpected newlines!!  So fprintf("%*.*s")!

Common back-end for HtmlIsPlain() (mode 1), HtmlToPlain() (mode 2) and
HtmlSanitise() (mode 3).  Output is written to <stdout>.

   HtmlPtr   the message text supplied by the caller
   mode      1 is "plain => plain"; 2 and 3 are passed by HtmlToPlain() and
             HtmlSanitise() respectively

NOTE(review): in this copy of the file the string literal that starts at the
end of the visible portion of this function looks damaged (its contents are
missing), so the remainder of the function cannot be described from here.
*/

void HtmlByJavaScript
(
char *HtmlPtr,
int mode
)
{
   /* none of these locals is referenced in the portion of the function visible here */
   int  size;
   char  *bptr, *cptr, *sptr, *zptr;

   /*********/
   /* begin */
   /*********/

   if (WatchEnabled) WatchThis (FI_LI, "HtmlByJavaScript() !UL", mode);

   /***
      'protxt' is the compose prologue
      'msgtxt' is the compose/message message text
      'epitxt' is the compose epilogue
   ***/

   if (mode == 1)
   {
      /* plain => plain */
      fprintf (stdout, "\n\ \n\
\n\ \n", 10, 1, stdout); } /*****************************************************************************/ /* Elementary UTF-8 encoding for characters > 127. */ char* HtmlUtf8Encode (char *TextPtr) { int ch, cnt; uchar *cptr, *sptr, *tptr; /*********/ /* begin */ /*********/ if (Debug) fprintf (stdout, "HtmlUtf8Encode() |%s|\n", TextPtr); for (cptr = (uchar*)TextPtr; *cptr && *cptr <= 127; cptr++); if (!*cptr) return (TextPtr); cnt = cptr - (uchar*)TextPtr; /* worst case for UTF-8 is plus 3 chars */ for (cptr = (uchar*)TextPtr; *cptr && *cptr > 127; cptr++) cnt += 3; tptr = CgiLibVeeMemCalloc (cnt+4); if (!tptr) ErrorExit (vaxc$errno, FI_LI); sptr = tptr; cptr = (uchar*)TextPtr; for (cptr = (uchar*)TextPtr; *cptr; cptr++) { if (*cptr <= 127) *sptr++ = *cptr; else sptr += HtmlUtf8EncodeChar (sptr, (uint)*cptr); } *sptr = '\0'; if (Debug) fprintf (stdout, "|%s|\n", tptr); return ((char*)tptr); } /*****************************************************************************/ /* UTF-8 encode a single character. Return the number of chars generated. 
*/ int HtmlUtf8EncodeChar (uchar *out, uint utf) { /*********/ /* begin */ /*********/ if (utf <= 0x7F) { /* plain ASCII */ out[0] = (char) utf; out[1] = 0; return (1); } else if (utf <= 0x07FF) { /* 2-byte unicode */ out[0] = (char) (((utf >> 6) & 0x1F) | 0xC0); out[1] = (char) (((utf >> 0) & 0x3F) | 0x80); out[2] = 0; return (2); } else if (utf <= 0xFFFF) { /* 3-byte unicode */ out[0] = (char) (((utf >> 12) & 0x0F) | 0xE0); out[1] = (char) (((utf >> 6) & 0x3F) | 0x80); out[2] = (char) (((utf >> 0) & 0x3F) | 0x80); out[3] = 0; return (3); } else if (utf <= 0x10FFFF) { /* 4-byte unicode */ out[0] = (char) (((utf >> 18) & 0x07) | 0xF0); out[1] = (char) (((utf >> 12) & 0x3F) | 0x80); out[2] = (char) (((utf >> 6) & 0x3F) | 0x80); out[3] = (char) (((utf >> 0) & 0x3F) | 0x80); out[4] = 0; return (4); } else { /* error - use replacement character */ out[0] = (char) 0xEF; out[1] = (char) 0xBF; out[2] = (char) 0xBD; out[3] = 0; return (0); } } /*****************************************************************************/ /* Check the leading text for what looks like a common HTML tag. */ BOOL HtmlInDisguise (char *TextPtr) { char *cptr; /*********/ /* begin */ /*********/ if (Debug) fprintf (stdout, "HtmlInDisguise()\n"); if (!TextPtr) return (FALSE); for (cptr = TextPtr; *cptr && isspace(*cptr); cptr++); if (WatchEnabled) WatchThis (FI_LI, "!8AZ", cptr); if (*cptr != '<') return (FALSE); if (!memcmp (cptr, "