-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathparser.go
194 lines (146 loc) · 4.83 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
// Copyright 2017 hIMEI
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/** File parser.go contains the data types and methods used for HTML content parsing. */
package main
import (
"strings"
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
)
// Parser groups the HTML/XPath parsing methods of this file. It has no
// fields, so a Parser value carries no state of its own.
type Parser struct{}
// NewParser allocates a fresh Parser and returns a pointer to it.
func NewParser() *Parser {
	return new(Parser)
}
// findEntry looks up a single HTML element below node using the XPath
// expression entryexp and returns whatever htmlquery.FindOne reports.
// Callers in this file treat a nil result as "no such element".
func findEntry(node *html.Node, entryexp string) *html.Node {
	match := htmlquery.FindOne(node, entryexp)
	return match
}
// findEntrys looks up every HTML element below node that matches the XPath
// expression entryexp and returns the slice produced by htmlquery.Find.
func findEntrys(node *html.Node, entryexp string) []*html.Node {
	matches := htmlquery.Find(node, entryexp)
	return matches
}
// getTag returns the inner text of the first element below node that matches
// the XPath expression tagexp.
//
// When no element matches, findEntry yields nil; in that case getTag returns
// an empty string so that htmlquery.InnerText is only ever called with a
// non-nil node.
func getTag(node *html.Node, tagexp string) string {
	entry := findEntry(node, tagexp)
	if entry == nil {
		return ""
	}
	return htmlquery.InnerText(entry)
}
// getHref reads the attribute named by the HREF constant from node and
// returns its value as reported by htmlquery.SelectAttr.
func getHref(node *html.Node) string {
	href := htmlquery.SelectAttr(node, HREF)
	return href
}
// unMap pulls a key and its *html.Node value out of nodeMap.
//
// The map is walked to the end and the last pair visited is returned, so the
// result is only well defined for maps holding a single entry (Go does not
// specify map iteration order). An empty or nil map yields "" and nil.
func unMap(nodeMap map[string]*html.Node) (string, *html.Node) {
	var (
		lastKey  string
		lastNode *html.Node
	)
	for k, v := range nodeMap {
		lastKey, lastNode = k, v
	}
	return lastKey, lastNode
}
// checkPage reports whether the page rooted at node contains an element
// matching SEARCHRESULT. Per the original note, true denotes a root page and
// false a host details page.
func (p *Parser) checkPage(node *html.Node) bool {
	return findEntry(node, SEARCHRESULT) != nil
}
// parseOne builds a *Host for every HOST element of a parsed page and sends
// the resulting slice on chanHost.
//
// node maps the page URL to its parsed document; unMap selects the entry, so
// the map is expected to hold exactly one pair. For each host element:
//   - the host fields are collected and the trimmed page URL is appended,
//   - the page referenced by the DETAILS link is fetched through getContents,
//   - every SERVICELONG element of that page is turned into a *Service.
//
// The slice is sent exactly once, after all hosts have been processed; when
// the page has no host elements a nil slice is sent.
func (p *Parser) parseOne(node map[string]*html.Node, chanHost chan []*Host) {
	var hosts []*Host

	url, hostNode := unMap(node)
	for _, h := range p.getHosts(hostNode) {
		fields := append(p.getHostFields(h), trimString(url))

		detailsLink := getHref(findEntry(h, DETAILS))
		req := NewRequest(detailsLink)

		var services []*Service
		// A receive may yield a nil node (for instance from a closed
		// channel); there is nothing to query then, so the host is kept
		// with an empty service list.
		// NOTE(review): confirm what getContents delivers on a failed fetch.
		if dnode := <-getContents(req.RequestStrings[0]); dnode != nil {
			for _, srv := range findEntrys(dnode, SERVICELONG) {
				services = append(services, NewService(p.getServiceFields(srv)))
			}
		}

		hosts = append(hosts, NewHost(fields, services))
	}

	chanHost <- hosts
}
// getHostFields collects the data needed to create a Host from a single host
// element and returns it as a two-element slice:
//
//	[0] the href of the LINK element inside the ONION element of the summary,
//	[1] the text of the summary's SPAN element with the ADDED prefix removed.
func (p *Parser) getHostFields(node *html.Node) []string {
	// Both fields live under the SUMMARY element, so look it up only once.
	summary := findEntry(node, SUMMARY)

	hostURL := getHref(findEntry(findEntry(summary, ONION), LINK))
	addDate := strings.TrimPrefix(getTag(summary, SPAN), ADDED)

	return []string{hostURL, addDate}
}
// getTotal returns the text of the TOTAL element found under root, passed
// through trimString.
func (p *Parser) getTotal(root *html.Node) string {
	return trimString(getTag(root, TOTAL))
}
// getServiceFields collects the data needed to create a Service from a single
// service element. The returned slice holds, in order: name, port, protocol,
// state, version and the <pre> details text.
func (p *Parser) getServiceFields(node *html.Node) []string {
	// text returns the trimmed inner text of the element matched by exp.
	text := func(exp string) string {
		return trimString(getTag(node, exp))
	}

	// The name comes from the H3 element when there is one; otherwise the
	// state text doubles as the name.
	nameExp := STATE
	if findEntry(node, H3) != nil {
		nameExp = H3
	}

	fields := []string{
		text(nameExp), // name
		text(PORT),    // port
		text(PROTO),   // protocol
		text(STATE),   // state
	}

	// The version is optional; a placeholder is stored when it is absent.
	version := "unknown VERSION"
	if findEntry(node, VERSION) != nil {
		version = text(VERSION)
	}

	// Last field: service details taken from the <pre> block.
	fields = append(fields, version, p.getPre(node))
	return fields
}
// getPre returns the text of the PRE element found under node, cleaned up
// line by line: each line loses one leading "\t" (if present) and is then
// passed through trimString. Lines are re-joined with "\n".
func (p *Parser) getPre(node *html.Node) string {
	lines := strings.Split(getTag(node, PRE), "\n")
	for i, line := range lines {
		lines[i] = trimString(strings.TrimPrefix(line, "\t"))
	}
	return strings.Join(lines, "\n")
}
// getService returns every element under node that matches the SERVICE
// expression (described in the original note as <div>s of class "service").
func (p *Parser) getService(node *html.Node) []*html.Node {
	services := findEntrys(node, SERVICE)
	return services
}
// getHosts returns every element under node that matches the HOST
// expression; each one is later used to build a Host.
func (p *Parser) getHosts(node *html.Node) []*html.Node {
	hostNodes := findEntrys(node, HOST)
	return hostNodes
}