-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgourlnormalize.go
119 lines (95 loc) · 2.3 KB
/
gourlnormalize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/*
@ go-url-normalize[RFC 3986]
@ Normalize URL | Count unique normalized URL | Count unique normalized URL per TLD
*/
package normalizer
import (
"fmt"
"net/url"
"strings"
"golang.org/x/net/idna"
"regexp"
)
// CountUniqueNormalizedUrls reports how many distinct strings remain after
// every entry of urls has been passed through NormalizeURL.
//
// Example: ["https://example.com?a=1&b=2", "https://example.com?b=2&a=1"]
// yields 1, because both entries normalize to the same string.
//
// Errors from NormalizeURL are ignored; the string it returns alongside the
// error is still counted. An empty or nil slice yields 0.
func CountUniqueNormalizedUrls(urls []string) int {
	seen := map[string]struct{}{}
	for i := range urls {
		// Best effort: the error is dropped on purpose and the returned
		// string is used as the set key regardless.
		key, _ := NormalizeURL(urls[i])
		seen[key] = struct{}{}
	}
	return len(seen)
}
/*
@ CountUniqueNormalizedUrls counts unique normalized url per TLD
@ calls NormalizeURL()
@ Ex: ["https://example.com", "https://subdomain.example.com"] returns map["example.com" => 2]
*/
func CountUniqueNormalizedUrlsPerTopLevelDomain(urls []string) map[string]int {
output := make(map[string]int)
if len(urls) == 0{
return output
}
for _, x := range urls{
output[fetchTLD(x)] = output[fetchTLD(x)] + 1
}
return output
}
// tldPattern matches, at the end of its input, one label followed by one or
// two further dot-separated labels of 2-3 characters each (for example
// "example.com" or "example.co.uk"). It is a length-based heuristic, not a
// public-suffix lookup. Compiled once at package scope instead of per call.
var tldPattern = regexp.MustCompile(`[^.]*\.[^.]{2,3}(?:\.[^.]{2,3})?$`)

// schemeStripper removes "http://" and "https://" from a matched string. It
// only matters on the fallback path where the raw input is matched directly.
var schemeStripper = strings.NewReplacer("http://", "", "https://", "")

// fetchTLD extracts the domain part used for grouping (e.g. "example.com")
// from a URL or bare host name. It returns "" when tldPattern finds no match.
//
// The input is parsed as a URL first so that the pattern, which is anchored
// at the end of its input, is applied to the host name only; a port, path,
// query, fragment or userinfo would otherwise prevent or distort the match.
// When parsing fails or yields no host (e.g. scheme-less input such as
// "example.com"), the raw string is matched instead.
func fetchTLD(domain string) string {
	host := domain
	if u, err := url.Parse(domain); err == nil && u.Hostname() != "" {
		host = u.Hostname()
	}
	return schemeStripper.Replace(tldPattern.FindString(host))
}
// Ports maps a URL scheme to its default port number. NormalizeURL consults
// it to strip a redundant ":<port>" suffix from the host.
var Ports = map[string]int{
	"ftp":   21,
	"http":  80,
	"https": 443,
}
// NormalizeURL returns a canonical string form of the URL s, so that
// equivalent URLs compare equal (in the spirit of RFC 3986 normalization).
//
// Steps, in order:
//   - surrounding whitespace is trimmed;
//   - a protocol-relative URL ("//host/path") or a URL without a scheme is
//     given the "http" scheme;
//   - the host is lowercased (hosts are case-insensitive);
//   - the scheme's default port, as listed in Ports, is removed from the host;
//   - the host is converted from Punycode to Unicode and a leading "www." is
//     dropped;
//   - query parameters are re-encoded sorted by key, then unescaped;
//   - one trailing "/" is removed from the resulting string.
//
// On any failure the trimmed input is returned together with the error.
//
// NormalizeURL is also used as a helper by CountUniqueNormalizedUrls.
func NormalizeURL(s string) (string, error) {
	s = strings.TrimSpace(s)

	// Resolve protocol-relative input before parsing. Doing it afterwards has
	// no effect, and the scheme-less branch below would then build
	// "http:////host/..." with an empty host.
	raw := s
	if strings.HasPrefix(raw, "//") {
		raw = "http:" + raw
	}

	u, err := url.Parse(raw)
	if err != nil {
		return s, err
	}
	if u.Scheme == "" {
		// Without a scheme url.Parse puts the host into Path; re-parse with
		// an explicit scheme so Host is populated.
		u, err = url.Parse("http://" + raw)
		if err != nil {
			return s, err
		}
	}

	// Hosts are case-insensitive; lowercase first so the "www." check below
	// and later string comparison are not defeated by letter case.
	u.Host = strings.ToLower(u.Host)

	// Drop the port when it is the default one for the scheme.
	if p, ok := Ports[u.Scheme]; ok {
		u.Host = strings.TrimSuffix(u.Host, fmt.Sprintf(":%d", p))
	}

	host, err := idna.ToUnicode(u.Host)
	if err != nil {
		// Return the trimmed input, like every other error path.
		return s, err
	}
	u.Host = strings.TrimPrefix(host, "www.")

	// Values.Encode sorts by key, which removes parameter-order differences.
	// The encoded form is then unescaped; if that fails, the encoded query is
	// kept rather than discarded.
	encoded := u.Query().Encode()
	if unescaped, err := url.QueryUnescape(encoded); err == nil {
		u.RawQuery = unescaped
	} else {
		u.RawQuery = encoded
	}

	return strings.TrimSuffix(u.String(), "/"), nil
}