Added punycode conversion placeholders and...

internal functions that provide conversion and tests for those per #120.
2014-09-09 14:27:24 -04:00 · 2014-09-09 14:27:24 -04:00 · e3c2c0734f
parent 3d8aa78c0a
commit e3c2c0734f
2 changed files with 251 additions and 0 deletions
--- a/punycode.go
+++ b/punycode.go
@ -0,0 +1,212 @@
+package dns
+
+import (
+	"bytes"
+	"unicode"
+)
+
+// See http://tools.ietf.org/html/rfc3492
+// Implementation idea from RFC itself and from from IDNA::Punycode created by
+// Tatsuhiko Miyagawa <miyagawa@bulknews.net> in 2002
+
+const (
+	_MIN  = '\u0001'
+	_MAX  = '\u001a' // 26
+	_SKEW = '\u0026' // 38
+	_DAMP = '\u02BC' // 700
+	_BASE = '\u0024' // 36
+	_BIAS = '\u0048' // 72
+	_N    = '\u0080' // 128
+
+	Delimiter = '-'
+	Prefix    = "xn--"
+)
+
+func IdnToASCII(string) string {
+	return ""
+}
+
+func IdnFromASCII(string) string {
+	return ""
+}
+
+// digit_value convert single byte into meaningful value that's used to calculate decoded unicode character.
+func digit_value(code rune) rune {
+	switch {
+	case code >= 'A' && code <= 'Z':
+		return code - 'A'
+	case code >= 'a' && code <= 'z':
+		return code - 'a'
+	case code >= '0' && code <= '9':
+		return code - '0' + 26
+	}
+	panic("never happens")
+}
+
+// code_point finds BASE36 byte (a-z0-9) based on calculated number.
+func code_point(digit rune) rune {
+	switch {
+	case digit >= 0 && digit <= 25:
+		return digit + 'a'
+	case digit >= 26 && digit <= 36:
+		return digit - 26 + '0'
+	}
+	panic("never happens")
+}
+
+// adapt calculates next bias to be used for next iteration delta
+func adapt_bias(delta rune, numpoints rune, firsttime bool) rune {
+	if firsttime {
+		delta /= _DAMP
+	} else {
+		delta /= 2
+	}
+
+	var k rune
+	for delta = delta + delta/numpoints; delta > (_BASE-_MIN)*_MAX/2; k += _BASE {
+		if _BASE <= _MIN {
+			panic("1")
+		}
+		delta /= _BASE - _MIN
+	}
+
+	return k + ((_BASE-_MIN+1)*delta)/(delta+_SKEW)
+}
+
+// next finds minimal rune (one with lowest codepoint value) that should be equal or above boundary.
+func next(b []rune, boundary rune) rune {
+	if len(b) == 0 {
+		panic("invalid set of runes to determine next one")
+	}
+	m := b[0]
+	for _, x := range b[1:] {
+		if x >= boundary && (m < boundary || x < m) {
+			m = x
+		}
+	}
+	return m
+}
+
+// PrepRune should do actions recommended by stringprep (RFC3491) for each unicode char. TODO(asergeyev): work on actual implementation, currently just lowercases Unicode chars.
+func PrepRune(r rune) rune {
+	if unicode.IsUpper(r) {
+		r = unicode.ToLower(r)
+	}
+	return r
+}
+
+// tfunc is a function that helps calculate each character weight
+func tfunc(k, bias rune) rune {
+	switch {
+	case k <= bias:
+		return _MIN
+	case k >= bias+_MAX:
+		return _MAX
+	}
+	return k - bias
+}
+
+// encode_punycode transforms Unicode input bytes (that represent DNS label) into punycode bytestream
+func encode_punycode(input []byte) []byte {
+	n, delta, bias := _N, rune(0), _BIAS
+
+	b := bytes.Runes(input)
+	for i := range b {
+		b[i] = PrepRune(b[i])
+	}
+
+	basic := make([]byte, 0, len(b))
+	for _, ltr := range b {
+		if ltr <= 0x7f {
+			basic = append(basic, byte(ltr))
+		}
+	}
+	basiclen := rune(len(basic))
+	fulllen := rune(len(b))
+	if basiclen == fulllen {
+		return basic
+	}
+
+	var out bytes.Buffer
+
+	out.WriteString(Prefix)
+	if basiclen > 0 {
+		out.Write(basic)
+		out.WriteByte(Delimiter)
+	}
+
+	for h := basiclen; h < fulllen; n, delta = n+1, delta+1 {
+		next := next(b, n)
+		s := &bytes.Buffer{}
+		s.WriteRune(next)
+		delta, n = delta+(next-n)*(h+1), next
+
+		for _, ltr := range b {
+			if ltr < n {
+				delta++
+			}
+			if ltr == n {
+				q := delta
+				for k := _BASE; ; k += _BASE {
+					t := tfunc(k, bias)
+					if q < t {
+						break
+					}
+					cp := t + ((q - t) % (_BASE - t))
+					out.WriteRune(code_point(cp))
+					q = (q - t) / (_BASE - t)
+				}
+
+				out.WriteRune(code_point(q))
+
+				bias = adapt_bias(delta, h+1, h == basiclen)
+				h, delta = h+1, 0
+			}
+		}
+	}
+	return out.Bytes()
+}
+
+// encode_punycode transforms punycode input bytes (that represent DNS label) into Unicode bytestream
+func decode_punycode(b []byte) []byte {
+	n, bias := _N, _BIAS
+	if !bytes.HasPrefix(b, []byte(Prefix)) {
+		return b
+	}
+	out := make([]rune, 0, len(b))
+	b = b[len(Prefix):]
+	pos := bytes.Index(b, []byte{Delimiter})
+	if pos >= 0 {
+		out = append(out, bytes.Runes(b[:pos])...)
+		b = b[pos+1:] // trim source string
+	}
+	for i := rune(0); len(b) > 0; i++ {
+		oldi, w, ch := i, rune(1), byte(0)
+		for k := _BASE; ; k += _BASE {
+			ch, b = b[0], b[1:]
+			digit := digit_value(rune(ch))
+			i += digit * w
+
+			t := tfunc(k, bias)
+			if digit < t {
+				break
+			}
+
+			w *= _BASE - t
+		}
+		ln := rune(len(out) + 1)
+		bias = adapt_bias(i-oldi, ln, oldi == 0)
+		n += i / ln
+		i = i % ln
+		// insert
+		out = append(out, 0)
+		copy(out[i+1:], out[i:])
+		out[i] = n
+	}
+
+	var ret bytes.Buffer
+	for _, r := range out {
+		ret.WriteRune(r)
+	}
+	return ret.Bytes()
+}
--- a/punycode_test.go
+++ b/punycode_test.go
@ -0,0 +1,39 @@
+package dns
+
+import (
+	"strings"
+	"testing"
+)
+
+var testcases = [][2]string{
+	{"a", "a"},
+	{"A-B", "a-b"},
+	{"AbC", "abc"},
+	{"я", "xn--41a"},
+	{"zя", "xn--z-0ub"},
+	{"ЯZ", "xn--z-zub"},
+	{"إختبار", "xn--kgbechtv"},
+	{"آزمایشی", "xn--hgbk6aj7f53bba"},
+	{"测试", "xn--0zwm56d"},
+	{"測試", "xn--g6w251d"},
+	{"Испытание", "xn--80akhbyknj4f"},
+	{"परीक्षा", "xn--11b5bs3a9aj6g"},
+	{"δοκιμή", "xn--jxalpdlp"},
+	{"테스트", "xn--9t4b11yi5a"},
+	{"טעסט", "xn--deba0ad"},
+	{"テスト", "xn--zckzah"},
+	{"பரிட்சை", "xn--hlcj6aya9esc7a"},
+}
+
+func TestEncodePunycode(t *testing.T) {
+	for _, tst := range testcases {
+		enc := encode_punycode([]byte(tst[0]))
+		if string(enc) != tst[1] {
+			t.Errorf("%s encodeded as %s but should be %s", tst[0], enc, tst[1])
+		}
+		dec := decode_punycode([]byte(tst[1]))
+		if string(dec) != strings.ToLower(tst[0]) {
+			t.Errorf("%s decoded as %s but should be %s", tst[1], dec, strings.ToLower(tst[0]))
+		}
+	}
+}