dovecot-2.2: liblib: Added generic URI parsing functions.

Sat Jun 2 19:02:27 EEST 2012

details:   http://hg.dovecot.org/dovecot-2.2/rev/ba36e4380cf4
changeset: 14587:ba36e4380cf4
user:      Stephan Bosch <stephan at rename-it.nl>
date:      Sat Jun 02 17:06:21 2012 +0300
description:
liblib: Added generic URI parsing functions.

diffstat:

 src/lib/Makefile.am |    2 +
 src/lib/uri-util.c  |  723 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lib/uri-util.h  |   49 +++
 3 files changed, 774 insertions(+), 0 deletions(-)

diffs (truncated from 799 to 300 lines):

diff -r 21d67121985a -r ba36e4380cf4 src/lib/Makefile.am

--- a/src/lib/Makefile.am	Sat Jun 02 16:55:21 2012 +0300
+++ b/src/lib/Makefile.am	Sat Jun 02 17:06:21 2012 +0300
@@ -121,6 +121,7 @@
 	unlink-directory.c \
 	unlink-old-files.c \
 	unichar.c \
+	uri-util.c \
 	utc-offset.c \
 	utc-mktime.c \
 	var-expand.c \
@@ -228,6 +229,7 @@
 	unlink-directory.h \
 	unlink-old-files.h \
 	unichar.h \
+	uri-util.h \
 	utc-offset.h \
 	utc-mktime.h \
 	var-expand.h \
diff -r 21d67121985a -r ba36e4380cf4 src/lib/uri-util.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/uri-util.c	Sat Jun 02 17:06:21 2012 +0300
@@ -0,0 +1,723 @@
+/* Copyright (c) 2010-2012 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "str.h"
+#include "network.h"
+#include "uri-util.h"
+
+#include <ctype.h>
+
+/*
+ * Generic URI parsing.
+ *
+ * [URI-GEN] RFC3986 Appendix A:
+ *
+ * host             = IP-literal / IPv4address / reg-name
+ * port             = *DIGIT
+ * reg-name         = *( unreserved / pct-encoded / sub-delims )
+ * unreserved       = ALPHA / DIGIT / "-" / "." / "_" / "~"
+ * pct-encoded      = "%" HEXDIG HEXDIG
+ * sub-delims       = "!" / "$" / "&" / "'" / "(" / ")"
+ *                  / "*" / "+" / "," / ";" / "="
+ * IP-literal       = "[" ( IPv6address / IPvFuture  ) "]"
+ * IPvFuture        = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
+ * IPv6address      =                            6( h16 ":" ) ls32
+ *                  /                       "::" 5( h16 ":" ) ls32
+ *                  / [               h16 ] "::" 4( h16 ":" ) ls32
+ *                  / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+ *                  / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+ *                  / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+ *                  / [ *4( h16 ":" ) h16 ] "::"              ls32
+ *                  / [ *5( h16 ":" ) h16 ] "::"              h16
+ *                  / [ *6( h16 ":" ) h16 ] "::"
+ * h16              = 1*4HEXDIG
+ * ls32             = ( h16 ":" h16 ) / IPv4address
+ * IPv4address      = dec-octet "." dec-octet "." dec-octet "." dec-octet
+ * dec-octet        = DIGIT                 ; 0-9
+ *                  / %x31-39 DIGIT         ; 10-99
+ *                  / "1" 2DIGIT            ; 100-199
+ *                  / "2" %x30-34 DIGIT     ; 200-249
+ *                  / "25" %x30-35          ; 250-255
+ */
+
+#define URI_MAX_SCHEME_NAME_LEN 64
+
+/* Character lookup table
+ *
+ * unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"     [bit0]
+ * sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+ *               / "*" / "+" / "," / ";" / "="               [bit1]
+ * gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"   [bit2]
+ * pchar         = unreserved / sub-delims / ":" / "@"       [bit0|bit1|bit3]
+ * 'uchar'       = unreserved / sub-delims / ":"             [bit0|bit1|bit4]
+ * 'fchar'       = pchar / "/" / "?"                    [bit0|bit1|bit3|bit5]
+ *
+ */
+
+static unsigned const char _uri_char_lookup[256] = {
+	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 00
+	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 10
+	 0,  2,  0,  4,  2,  0,  2,  2,  2,  2,  2,  2,  2,  1,  1, 36,  // 20
+	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 28,  2,  0,  2,  0, 36,  // 30
+	12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 40
+	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  4,  0,  4,  0,  1,  // 50
+	 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 60
+	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  0,  // 70
+};	/* Only 0x00-0x7f are listed; entries 0x80-0xff are implicitly zero,
+	 * so bytes with the high bit set belong to no character class. */
+
+/* Returns the numeric value (0..15) of an ASCII hexadecimal digit, accepting
+ * both upper and lower case, or -1 when the byte is not a hex digit.
+ */
+static inline int _decode_hex_digit(const unsigned char digit)
+{
+	if (digit >= '0' && digit <= '9')
+		return digit - '0';
+	if (digit >= 'a' && digit <= 'f')
+		return digit - 'a' + 0x0a;
+	if (digit >= 'A' && digit <= 'F')
+		return digit - 'A' + 0x0A;
+	return -1;
+}
+
+/* Decodes the two hex digits of a percent-encoding.
+ *
+ * On entry *p must point just past the '%'. pend, when non-NULL, is the
+ * exclusive end of the input; when NULL the input is bounded only by a NUL
+ * byte. On success the decoded byte is stored in *ch_r, *p is advanced past
+ * both digits and 1 is returned. On failure parser->error is set and -1 is
+ * returned; *p may then have been advanced past the digits already examined
+ * and *ch_r may hold a partial value.
+ */
+static int
+uri_parse_pct_encoded(struct uri_parser *parser, const unsigned char **p,
+		      const unsigned char *pend, unsigned char *ch_r)
+{
+	int value;
+
+	/* Test the explicit bound before dereferencing anything: when the '%'
+	   was the last byte of a buffer that is not NUL-terminated, *p already
+	   equals pend and must not be read. The pointer difference also avoids
+	   forming a pointer beyond one-past-the-end. */
+	if ((pend != NULL && pend - *p < 2) ||
+	    **p == 0 || *(*p+1) == 0) {
+		parser->error = "Unexpected URI boundary after '%'";
+		return -1;
+	}
+
+	if ((value = _decode_hex_digit(**p)) < 0) {
+		parser->error = t_strdup_printf(
+			"Expecting hex digit after '%%', but found '%c'", **p);
+		return -1;
+	}
+
+	/* high nibble */
+	*ch_r = (value & 0x0f) << 4;
+	*p += 1;
+
+	if ((value = _decode_hex_digit(**p)) < 0) {
+		parser->error = t_strdup_printf(
+			"Expecting hex digit after '%%%c', but found '%c'",
+			*((*p)-1), **p);
+		return -1;
+	}
+
+	/* low nibble */
+	*ch_r |= (value & 0x0f);
+	*p += 1;
+
+	/* "%00" would smuggle a string terminator into the decoded data */
+	if (*ch_r == '\0') {
+		parser->error =
+			"Percent encoding is not allowed to encode NUL character";
+		return -1;
+	}
+	return 1;
+}
+
+/* Consumes one 'unreserved' character, or one percent-encoding, at
+ * parser->cur.
+ *
+ * Returns 1 with the (decoded) byte in *ch_r, 0 when the next byte is neither
+ * (nothing is consumed), or -1 when a percent-encoding is malformed. A
+ * percent-encoded byte is accepted whatever it decodes to. parser->cur is
+ * dereferenced without a bounds check, so the caller must make sure it is
+ * below parser->end.
+ */
+static int
+uri_parse_unreserved_char(struct uri_parser *parser, unsigned char *ch_r)
+{
+	const unsigned char c = *parser->cur;
+
+	if (c == '%') {
+		parser->cur++;
+		return uri_parse_pct_encoded(parser, &parser->cur,
+					     parser->end, ch_r) <= 0 ? -1 : 1;
+	}
+
+	/* bytes with the high bit set are never unreserved; bit0 of the
+	   lookup table marks the unreserved class */
+	if ((c & 0x80) != 0 || (_uri_char_lookup[c] & 0x01) == 0)
+		return 0;
+
+	*ch_r = c;
+	parser->cur++;
+	return 1;
+}
+
+/* Parses the longest run of unreserved characters / percent-encodings
+ * starting at parser->cur, appending each decoded byte to part when part is
+ * non-NULL.
+ *
+ * Returns 1 when at least one character was consumed, 0 when none was, and
+ * -1 when a malformed percent-encoding was encountered.
+ */
+int uri_parse_unreserved(struct uri_parser *parser, string_t *part)
+{
+	bool consumed = FALSE;
+
+	while (parser->cur < parser->end) {
+		unsigned char ch = 0;
+		int ret = uri_parse_unreserved_char(parser, &ch);
+
+		if (ret < 0)
+			return -1;
+		if (ret == 0)
+			break;
+
+		if (part != NULL)
+			str_append_c(part, ch);
+		consumed = TRUE;
+	}
+
+	return consumed ? 1 : 0;
+}
+
+/* Percent-decodes the string starting at data.
+ *
+ * Decoding stops at the first NUL byte or, when until is non-NULL, on
+ * reaching until (exclusive). When decoded_r is non-NULL it receives the
+ * result as a t_strdup()'d copy; empty input yields "". Returns FALSE, with
+ * parser->error set by the percent-decoder, when a percent-encoding is
+ * malformed or is cut short by the end of the input.
+ */
+bool uri_data_decode(struct uri_parser *parser, const char *data,
+		     const char *until, const char **decoded_r)
+{
+	const unsigned char *p = (const unsigned char *)data;
+	/* NULL means unlimited; solely rely on '\0' */
+	const unsigned char *pend = (const unsigned char *)until;
+	string_t *decoded;
+
+	if ((pend != NULL && p >= pend) || *p == '\0') {
+		if (decoded_r != NULL)
+			*decoded_r = "";
+		return TRUE;
+	}
+
+	decoded = uri_parser_get_tmpbuf(parser, 256);
+	while ((pend == NULL || p < pend) && *p != '\0') {
+		unsigned char ch;
+
+		if (*p == '%') {
+			p++;
+			/* Hand over the real bound, so that an escape cut
+			   short by 'until' is rejected rather than decoded
+			   from bytes beyond the caller's limit. */
+			if (uri_parse_pct_encoded(parser, &p, pend, &ch) <= 0)
+				return FALSE;
+
+			str_append_c(decoded, ch);
+		} else {
+			str_append_c(decoded, *p);
+			p++;
+		}
+	}
+
+	if (decoded_r != NULL)
+		*decoded_r = t_strdup(str_c(decoded));
+	return TRUE;
+}
+
+/* Splits a leading "scheme:" off the NUL-terminated string *uri_p.
+ *
+ * RFC 3986:
+ *   scheme  = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+ *
+ * On success the scheme name (without the ':') is returned as a
+ * t_strdup_until() copy and *uri_p is moved just past the ':'. NULL is
+ * returned, leaving *uri_p untouched, when the string does not start with a
+ * valid scheme followed by ':' or when the scheme name reaches
+ * URI_MAX_SCHEME_NAME_LEN characters without a ':' following.
+ */
+const char *uri_cut_scheme(const char **uri_p)
+{
+	const char *start = *uri_p;
+	const char *cur;
+	size_t n;
+
+	if (!i_isalpha(*start))
+		return NULL;
+
+	cur = start + 1;
+	for (n = 1; n < URI_MAX_SCHEME_NAME_LEN && *cur != '\0'; n++, cur++) {
+		if (!(i_isalnum(*cur) || *cur == '+' || *cur == '-' ||
+		      *cur == '.'))
+			break;
+	}
+
+	if (*cur != ':')
+		return NULL;
+
+	*uri_p = cur + 1;
+	return t_strdup_until(start, cur);
+}
+
+/* Parses a "scheme:" prefix at parser->cur.
+ *
+ * Returns 1 with the scheme name in *scheme_r and parser->cur moved past the
+ * ':'. Returns 0 when the input is exhausted (*scheme_r is left untouched) or
+ * when no valid scheme is found (*scheme_r is set to NULL); parser->cur is
+ * unchanged in both cases.
+ *
+ * NOTE(review): uri_cut_scheme() stops at NUL and is not bounded by
+ * parser->end, so the input is presumably expected to be NUL-terminated --
+ * confirm against callers.
+ */
+int uri_parse_scheme(struct uri_parser *parser, const char **scheme_r)
+{
+	const char *pos = (const char *)parser->cur;
+
+	if (parser->cur >= parser->end)
+		return 0;
+
+	*scheme_r = uri_cut_scheme(&pos);
+	if (*scheme_r == NULL)
+		return 0;
+
+	parser->cur = (const unsigned char *)pos;
+	return 1;
+}
+
+static int
+uri_parse_dec_octet(struct uri_parser *parser, string_t *literal,
+		    uint8_t *octet_r)
+{
+	uint8_t octet = 0;
+	int count = 0;
+
+	/* RFC 3986:
+	 *
+	 * dec-octet     = DIGIT                 ; 0-9
+	 *               / %x31-39 DIGIT         ; 10-99
+	 *               / "1" 2DIGIT            ; 100-199
+	 *               / "2" %x30-34 DIGIT     ; 200-249
+	 *               / "25" %x30-35          ; 250-255
+	 */
+
+	while (parser->cur < parser->end && i_isdigit(*parser->cur)) {
+		uint8_t prev = octet;
+
+		octet = octet * 10 + (uint8_t)(parser->cur[0] - '0');
+		if (octet < prev)
+			return -1;
+
+		if (literal != NULL)
+			str_append_c(literal, *parser->cur);
+
+		parser->cur++;
+		count++;