diff options
| author | Ambrose Li <ambrose.li@gmail.com> | 2020-08-25 03:48:09 +0000 | 
|---|---|---|
| committer | Ambrose Li <ambrose.li@gmail.com> | 2020-08-25 03:48:09 +0000 | 
| commit | 48c9ec565d0a8e147adb61eb79777812688bfbaa (patch) | |
| tree | d249686b321e535348c1426eddfa10b610ef5858 | |
| parent | Update ChangeLog (diff) | |
| download | w3m-48c9ec565d0a8e147adb61eb79777812688bfbaa.tar.gz w3m-48c9ec565d0a8e147adb61eb79777812688bfbaa.zip  | |
In HTML5, anchors should not be closed when encountering divs, for example, but should be closed when encountering buttons, for example. Many sites that use HTML5-style anchors end up having links displayed with zero-length link texts. The proposed patch corrects this behaviour by detecting whether the document is HTML5, then suppressing the close-anchor action in CLOSE_A if it's an HTML5 document. A new macro handles the HTML5-specific cases where anchors are not already always closed.
This also fixes a bug in the tokenizing FSM in etc.c that prevented the !doctype element from being recognized; the fix is necessary because HTML5 detection depends on checking the !doctype element.
| -rw-r--r-- | etc.c | 5 | ||||
| -rw-r--r-- | file.c | 25 | ||||
| -rw-r--r-- | fm.h | 3 | ||||
| -rw-r--r-- | html.c | 6 | ||||
| -rw-r--r-- | html.h | 1 | ||||
| -rw-r--r-- | tests/a1.expected | 2 | ||||
| -rw-r--r-- | tests/a1.html | 1 | ||||
| -rw-r--r-- | tests/a2.expected | 1 | ||||
| -rw-r--r-- | tests/a2.html | 3 | ||||
| -rw-r--r-- | tests/run_tests | 31 | 
10 files changed, 72 insertions, 6 deletions
@@ -727,6 +727,11 @@ next_status(char c, int *status)  	case '>':  	    *status = R_ST_NORMAL;  	    break; +	case 'D': +	case 'd': +	    /* could be a !doctype */ +	    *status = R_ST_TAG; +	    break;  	default:  	    *status = R_ST_IRRTAG;  	} @@ -1,4 +1,5 @@  /* $Id: file.c,v 1.266 2012/05/22 09:45:56 inu Exp $ */ +/* vi: set sw=4 ts=8 ai sm noet : */  #include "fm.h"  #include <sys/types.h>  #include "myctype.h" @@ -4322,9 +4323,18 @@ process_idattr(struct readbuffer *obuf, int cmd, struct parsed_tag *tag)        obuf->flag &= ~RB_P;\      } -#define CLOSE_A \ -    CLOSE_P; \ -    close_anchor(h_env, obuf); +#define HTML5_CLOSE_A do { \ +	if (obuf->flag & RB_HTML5) { \ +	    close_anchor(h_env, obuf); \ +	} \ +    } while (0) + +#define CLOSE_A do { \ +	CLOSE_P; \ +	if (!(obuf->flag & RB_HTML5)) { \ +	    close_anchor(h_env, obuf); \ +	} \ +    } while (0)  #define CLOSE_DT \      if (obuf->flag & RB_IN_DT) { \ @@ -4930,6 +4940,8 @@ HTMLtagproc1(struct parsed_tag *tag, struct html_feed_environ *h_env)  	close_anchor(h_env, obuf);  	return 1;      case HTML_IMG: +	if (parsedtag_exists(tag, ATTR_USEMAP)) +	    HTML5_CLOSE_A;  	tmp = process_img(tag, h_env->limit);  	HTMLlineproc1(tmp->ptr, h_env);  	return 1; @@ -5125,6 +5137,7 @@ HTMLtagproc1(struct parsed_tag *tag, struct html_feed_environ *h_env)             HTMLlineproc1(tmp->ptr, h_env);         return 1;      case HTML_BUTTON: +       HTML5_CLOSE_A;         tmp = process_button(tag);         if (tmp)             HTMLlineproc1(tmp->ptr, h_env); @@ -5180,6 +5193,11 @@ HTMLtagproc1(struct parsed_tag *tag, struct html_feed_environ *h_env)  			     NULL);  	HTMLlineproc1(tmp->ptr, h_env);  	return 1; +    case HTML_DOCTYPE: +	if (!parsedtag_exists(tag, ATTR_PUBLIC)) { +	    obuf->flag |= RB_HTML5; +	} +	return 1;      case HTML_META:  	p = q = r = NULL;  	parsedtag_get_value(tag, ATTR_HTTP_EQUIV, &p); @@ -5378,6 +5396,7 @@ HTMLtagproc1(struct parsed_tag *tag, struct html_feed_environ *h_env)  	}  	return 1;     
 case HTML_EMBED: +	HTML5_CLOSE_A;  	if (view_unseenobject) {  	    if (parsedtag_get_value(tag, ATTR_SRC, &p)) {  		Str s; @@ -661,6 +661,7 @@ struct readbuffer {  #endif				/* FORMAT_NICE */  #define RB_DEL		0x100000  #define RB_S		0x200000 +#define RB_HTML5	0x400000  #define RB_GET_ALIGN(obuf) ((obuf)->flag&RB_ALIGN)  #define RB_SET_ALIGN(obuf,align) {(obuf)->flag &= ~RB_ALIGN; (obuf)->flag |= (align); } @@ -673,7 +674,7 @@ struct readbuffer {     RB_SET_ALIGN(obuf,(obuf)->flag_stack[--(obuf)->flag_sp]); \  } -/* status flags */ +/* state of token scanning finite state machine */  #define R_ST_NORMAL 0		/* normal */  #define R_ST_TAG0   1		/* within tag, just after < */  #define R_ST_TAG    2		/* within tag */ @@ -37,6 +37,8 @@ unsigned char ALST_TABLE[] =      ATTR_CELLPADDING, ATTR_VSPACE, ATTR_CORE  };  #define MAXA_TABLE	MAXA_CORE + 6 +unsigned char ALST_DOCTYPE[] = { ATTR_PUBLIC }; /* only (html and) public should be checked */ +#define MAXA_DOCTYPE	1  unsigned char ALST_META[] = { ATTR_HTTP_EQUIV, ATTR_CONTENT, ATTR_CHARSET, ATTR_CORE };  #define MAXA_META	MAXA_CORE + 3  unsigned char ALST_FRAME[] = { ATTR_SRC, ATTR_NAME, ATTR_CORE }; @@ -221,7 +223,7 @@ TagInfo TagMAP[MAX_HTMLTAG] = {      {"/option", NULL, 0, TFLG_END},	/*  94 HTML_N_OPTION   */      {"head", ALST_NOP, MAXA_NOP, 0},	/*  95 HTML_HEAD       */      {"/head", NULL, 0, TFLG_END},	/*  96 HTML_N_HEAD     */ -    {"doctype", ALST_NOP, MAXA_NOP, 0},	/*  97 HTML_DOCTYPE    */ +    {"doctype", ALST_DOCTYPE, MAXA_DOCTYPE, 0},	/*  97 HTML_DOCTYPE    */      {"noframes", ALST_NOFRAMES, MAXA_NOFRAMES, 0},	/*  98 HTML_NOFRAMES   */      {"/noframes", NULL, 0, TFLG_END},	/*  99 HTML_N_NOFRAMES */ @@ -367,7 +369,7 @@ TagAttrInfo AttrMAP[MAX_TAGATTR] = {      {"rev", VTYPE_STR, 0},	/* 48 ATTR_REV            */      {"title", VTYPE_STR, 0},	/* 49 ATTR_TITLE          */      {"accesskey", VTYPE_STR, 0},	/* 50 ATTR_ACCESSKEY          */ -    {NULL, VTYPE_NONE, 0},	/* 51 Undefined           */ +    {"public", 
VTYPE_NONE, 0},	/* 51 ATTR_PUBLIC         */      {NULL, VTYPE_NONE, 0},	/* 52 Undefined           */      {NULL, VTYPE_NONE, 0},	/* 53 Undefined           */      {NULL, VTYPE_NONE, 0},	/* 54 Undefined           */ @@ -318,6 +318,7 @@ typedef struct {  #define ATTR_REV		48  #define ATTR_TITLE		49  #define ATTR_ACCESSKEY		50 +#define ATTR_PUBLIC		51  /* Internal attribute */  #define ATTR_XOFFSET		60 diff --git a/tests/a1.expected b/tests/a1.expected new file mode 100644 index 0000000..5812232 --- /dev/null +++ b/tests/a1.expected @@ -0,0 +1,2 @@ +  +test diff --git a/tests/a1.html b/tests/a1.html new file mode 100644 index 0000000..7e89006 --- /dev/null +++ b/tests/a1.html @@ -0,0 +1 @@ +<a href="example"><div>test</div></a> diff --git a/tests/a2.expected b/tests/a2.expected new file mode 100644 index 0000000..9daeafb --- /dev/null +++ b/tests/a2.expected @@ -0,0 +1 @@ +test diff --git a/tests/a2.html b/tests/a2.html new file mode 100644 index 0000000..197b006 --- /dev/null +++ b/tests/a2.html @@ -0,0 +1,3 @@ +<!doctype html> +<meta charset=utf-8> +<a href="example"><div>test</div></a> diff --git a/tests/run_tests b/tests/run_tests new file mode 100644 index 0000000..0ec3080 --- /dev/null +++ b/tests/run_tests @@ -0,0 +1,31 @@ +total=0 +pass=0 +fail=0 +w3m="../w3m +-config +/dev/null +-o +ignore_null_img_alt=false" +for i in *.html; do +	cmd="$w3m +-I +utf-8 +-O +utf-8 +-T +text/html" +	opts="`basename "$i" .html`.opts" +	test -f "$opts" && cmd="$cmd +`grep -v '^#' $opts`" +	if (set -x;IFS=' +';$cmd) < "$i" | diff -u - "`basename "$i" .html`.expected"; then +		pass="`expr 1 + "$pass"`" +	else +		fail="`expr 1 + "$fail"`" +	fi +	total="`expr 1 + "$total"`" +done +echo "TOTAL: $total test(s)" +echo "PASS : $pass" +echo "FAIL : $fail" +test 0 -eq "$fail"  | 
