about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Ambrose Li <ambrose.li@gmail.com> 2020-08-24 02:20:43 +0000
committer Ambrose Li <ambrose.li@gmail.com> 2020-08-24 02:20:43 +0000
commit 9f18e67a9bb5ff0387f76e5a2870b49558f868ad (patch)
tree feb45c9c4e518c0ab784b82b25b6ea85158e0455
parent Update ChangeLog (diff)
download w3m-9f18e67a9bb5ff0387f76e5a2870b49558f868ad.tar.gz
w3m-9f18e67a9bb5ff0387f76e5a2870b49558f868ad.zip
Cleaned version of 20200823_q branch. Changes the behaviour of the q tag (when m17n and Unicode are configured) to use "smart" quotes if the display charset can handle them. Falls back to the old behaviour (the ASCII characters 6/0 "`" and 2/7 "'" used as left and right quotes) if the display charset is us-ascii.
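Also changes the behaviour of conv_entity() to convert left/right quotes and some dashes to ASCII equivalents (", ', -, --) when the display charset cannot represent them, instead of returning "?". This is needed because the new code for the q tag emits named entities.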
Diffstat (limited to '')
-rw-r--r-- entity.c 14
-rw-r--r-- file.c 18
-rw-r--r-- fm.h 1
-rw-r--r-- tests/name_entity_1.expected 2
-rw-r--r-- tests/name_entity_1.html 2
-rw-r--r-- tests/name_entity_1.opts 2
-rw-r--r-- tests/name_entity_2.expected 1
-rw-r--r-- tests/name_entity_2.html 1
-rw-r--r-- tests/q1.expected 1
-rw-r--r-- tests/q1.html 2
-rw-r--r-- tests/q1.opts 2
-rw-r--r-- tests/q2.expected 1
-rw-r--r-- tests/q2.html 3
-rw-r--r-- tests/q3.expected 1
-rw-r--r-- tests/q3.html 3
-rw-r--r-- tests/q3.opts 4
-rw-r--r-- tests/q4.expected 1
-rw-r--r-- tests/q4.html 3
-rw-r--r-- tests/q4.opts 2
-rw-r--r-- tests/q5.expected 1
-rw-r--r-- tests/q5.html 3
-rw-r--r-- tests/q6.expected 1
-rw-r--r-- tests/q6.html 3
-rw-r--r-- tests/q6.opts 1
-rw-r--r-- tests/run_tests 31
25 files changed, 103 insertions, 1 deletions
diff --git a/entity.c b/entity.c
index 45dc95e..67b8cfb 100644
--- a/entity.c
+++ b/entity.c
@@ -58,11 +58,23 @@ conv_entity(unsigned int c)
#ifdef USE_M17N
#ifdef USE_UNICODE
if (c <= WC_C_UCS4_END) { /* Unicode */
+ char *chk;
wc_uchar utf8[7];
wc_ucs_to_utf8(c, utf8);
- return wc_conv((char *)utf8, WC_CES_UTF_8, InnerCharset)->ptr;
+ /* we eventually need to display it so check DisplayCharset */
+ chk = wc_conv((char *)utf8, WC_CES_UTF_8, DisplayCharset ? DisplayCharset : WC_CES_US_ASCII)->ptr;
+ if (strcmp(chk, "?") != 0)
+ return wc_conv((char *)utf8, WC_CES_UTF_8, InnerCharset)->ptr;
}
#endif
#endif
+ if (c == 0x201c || c == 0x201f || c == 0x201d || c == 0x2033)
+ return "\"";
+ if (c == 0x2018 || c == 0x201b || c == 0x2019 || c == 0x2032)
+ return "'";
+ if (c >= 0x2010 && c < 0x2014)
+ return "-";
+ if (c == 0x2014)
+ return "--";
return "?";
}
diff --git a/file.c b/file.c
index c0fc044..cf7a931 100644
--- a/file.c
+++ b/file.c
@@ -4487,9 +4487,27 @@ HTMLtagproc1(struct parsed_tag *tag, struct html_feed_environ *h_env)
HTMLlineproc1("</b>", h_env);
return 1;
case HTML_Q:
+#ifdef USE_M17N
+#ifdef USE_UNICODE
+ if (DisplayCharset != WC_CES_US_ASCII) {
+ HTMLlineproc1((obuf->q_level & 1 ? "&ldquo;": "&lsquo;"), h_env);
+ obuf->q_level += 1;
+ }
+ else
+#endif
+#endif
HTMLlineproc1("`", h_env);
return 1;
case HTML_N_Q:
+#ifdef USE_M17N
+#ifdef USE_UNICODE
+ if (DisplayCharset != WC_CES_US_ASCII) {
+ obuf->q_level -= 1;
+ HTMLlineproc1((obuf->q_level & 1 ? "&rdquo;": "&rsquo;"), h_env);
+ }
+ else
+#endif
+#endif
HTMLlineproc1("'", h_env);
return 1;
case HTML_FIGURE:
diff --git a/fm.h b/fm.h
index 4a17ecc..6ce0f19 100644
--- a/fm.h
+++ b/fm.h
@@ -610,6 +610,7 @@ struct readbuffer {
int flag_sp;
int status;
unsigned char end_tag;
+ unsigned char q_level;
short table_level;
short nobr_level;
Anchor anchor;
diff --git a/tests/name_entity_1.expected b/tests/name_entity_1.expected
new file mode 100644
index 0000000..2a6fd2c
--- /dev/null
+++ b/tests/name_entity_1.expected
@@ -0,0 +1,2 @@
+This is an example sentence that contains some "quoted words" --
+punctuation that would be displayed as question marks but should not.
diff --git a/tests/name_entity_1.html b/tests/name_entity_1.html
new file mode 100644
index 0000000..f2e3633
--- /dev/null
+++ b/tests/name_entity_1.html
@@ -0,0 +1,2 @@
+This is an example sentence that contains some &ldquo;quoted words&rdquo; &mdash;
+<br>punctuation that would be displayed as question marks but should not.
diff --git a/tests/name_entity_1.opts b/tests/name_entity_1.opts
new file mode 100644
index 0000000..f9ac4b0
--- /dev/null
+++ b/tests/name_entity_1.opts
@@ -0,0 +1,2 @@
+-O
+us-ascii
diff --git a/tests/name_entity_2.expected b/tests/name_entity_2.expected
new file mode 100644
index 0000000..b287794
--- /dev/null
+++ b/tests/name_entity_2.expected
@@ -0,0 +1 @@
+2πr
diff --git a/tests/name_entity_2.html b/tests/name_entity_2.html
new file mode 100644
index 0000000..9be30f0
--- /dev/null
+++ b/tests/name_entity_2.html
@@ -0,0 +1 @@
+2&pi;r
diff --git a/tests/q1.expected b/tests/q1.expected
new file mode 100644
index 0000000..84b67a1
--- /dev/null
+++ b/tests/q1.expected
@@ -0,0 +1 @@
+`test'
diff --git a/tests/q1.html b/tests/q1.html
new file mode 100644
index 0000000..292b019
--- /dev/null
+++ b/tests/q1.html
@@ -0,0 +1,2 @@
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<q>test</q>
diff --git a/tests/q1.opts b/tests/q1.opts
new file mode 100644
index 0000000..f9ac4b0
--- /dev/null
+++ b/tests/q1.opts
@@ -0,0 +1,2 @@
+-O
+us-ascii
diff --git a/tests/q2.expected b/tests/q2.expected
new file mode 100644
index 0000000..bd72179
--- /dev/null
+++ b/tests/q2.expected
@@ -0,0 +1 @@
+“test”
diff --git a/tests/q2.html b/tests/q2.html
new file mode 100644
index 0000000..8f1e80b
--- /dev/null
+++ b/tests/q2.html
@@ -0,0 +1,3 @@
+<!doctype html>
+<meta charset=utf-8>
+<q>test</q>
diff --git a/tests/q3.expected b/tests/q3.expected
new file mode 100644
index 0000000..c5e67df
--- /dev/null
+++ b/tests/q3.expected
@@ -0,0 +1 @@
+test
diff --git a/tests/q3.html b/tests/q3.html
new file mode 100644
index 0000000..672a868
--- /dev/null
+++ b/tests/q3.html
@@ -0,0 +1,3 @@
+<!doctype html>
+<meta charset=Big5>
+<q>test</q>
diff --git a/tests/q3.opts b/tests/q3.opts
new file mode 100644
index 0000000..32d2b1a
--- /dev/null
+++ b/tests/q3.opts
@@ -0,0 +1,4 @@
+-I
+windows-1252
+-O
+windows-1252
diff --git a/tests/q4.expected b/tests/q4.expected
new file mode 100644
index 0000000..c5e67df
--- /dev/null
+++ b/tests/q4.expected
@@ -0,0 +1 @@
+test
diff --git a/tests/q4.html b/tests/q4.html
new file mode 100644
index 0000000..672a868
--- /dev/null
+++ b/tests/q4.html
@@ -0,0 +1,3 @@
+<!doctype html>
+<meta charset=Big5>
+<q>test</q>
diff --git a/tests/q4.opts b/tests/q4.opts
new file mode 100644
index 0000000..074570d
--- /dev/null
+++ b/tests/q4.opts
@@ -0,0 +1,2 @@
+-O
+windows-1252
diff --git a/tests/q5.expected b/tests/q5.expected
new file mode 100644
index 0000000..4f4614d
--- /dev/null
+++ b/tests/q5.expected
@@ -0,0 +1 @@
+“example of a ‘nested’ quote”
diff --git a/tests/q5.html b/tests/q5.html
new file mode 100644
index 0000000..f4fe761
--- /dev/null
+++ b/tests/q5.html
@@ -0,0 +1,3 @@
+<!doctype html>
+<meta charset=utf-8>
+<q>example of a <q>nested</q> quote</q>
diff --git a/tests/q6.expected b/tests/q6.expected
new file mode 100644
index 0000000..fc98f57
--- /dev/null
+++ b/tests/q6.expected
@@ -0,0 +1 @@
+"example of a 'nested' quote"
diff --git a/tests/q6.html b/tests/q6.html
new file mode 100644
index 0000000..e5f1de5
--- /dev/null
+++ b/tests/q6.html
@@ -0,0 +1,3 @@
+<!doctype html>
+<meta charset=big5>
+<q>example of a <q>nested</q> quote</q>
diff --git a/tests/q6.opts b/tests/q6.opts
new file mode 100644
index 0000000..1b59fbf
--- /dev/null
+++ b/tests/q6.opts
@@ -0,0 +1 @@
+-O Big5
diff --git a/tests/run_tests b/tests/run_tests
new file mode 100644
index 0000000..0ec3080
--- /dev/null
+++ b/tests/run_tests
@@ -0,0 +1,31 @@
+total=0
+pass=0
+fail=0
+w3m="../w3m
+-config
+/dev/null
+-o
+ignore_null_img_alt=false"
+for i in *.html; do
+ cmd="$w3m
+-I
+utf-8
+-O
+utf-8
+-T
+text/html"
+ opts="`basename "$i" .html`.opts"
+ test -f "$opts" && cmd="$cmd
+`grep -v '^#' $opts`"
+ if (set -x;IFS='
+';$cmd) < "$i" | diff -u - "`basename "$i" .html`.expected"; then
+ pass="`expr 1 + "$pass"`"
+ else
+ fail="`expr 1 + "$fail"`"
+ fi
+ total="`expr 1 + "$total"`"
+done
+echo "TOTAL: $total test(s)"
+echo "PASS : $pass"
+echo "FAIL : $fail"
+test 0 -eq "$fail"