checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
---
2
2
SHA256:
3
- metadata.gz: b724a350c5e935bf7e26c0bf214299497b1703e23da67aa9acbd5d9619cfefe1
4
- data.tar.gz: 6c0e0603c12e386c459b3d584e7baa53f79668c64cc29f2cbaaa01a6a8fcb80b
3
+ metadata.gz: 40f6c6a9c5d3225fe8b8c293e05306e8d1776c3648f82b4f7f8387310a55ec2d
4
+ data.tar.gz: ed8a2164e15d5d9f74672434cda4ee93975bfd8f80cb65b295451509bf25a6f9
5
5
SHA512:
6
- metadata.gz: bc60489507fb0b01c6a449431cc9e29caa7f5f2ffe476acd4bf61615e2e016c7e5c75da2a0c11d44b21763d8618be2ff146a9cf061858e4f99f8399dbeef5898
7
- data.tar.gz: 9bce15fbd2529d5f7d9dcc2f9ef7984cb5208f2835d8325129099bde3e0a9dc85566ef0a89be293e738d88abe02d6318b96b547f106619abe9dbb42437c8cc53
6
+ metadata.gz: 345b32732cb544585a3a12c7c6107ab8011c64ea1938043c82c74b66a0f2a12c96c5911c1533e6819c6d6809ddc73c95801c94faaf6f4dea0ddcf0fa58ab2594
7
+ data.tar.gz: e250dad59f16ae47d3866b0b5b9b9254d1441e2ce0c30fe8f689e83f1c07136b364f9a8c84b8e257c942f00c58400bdb5283aed99695036a22e0f9604a04194f
data/.idea/workspace.xml CHANGED
@@ -4,7 +4,6 @@
4
4
<list default="true" id="07223dd4-8944-486b-a29b-7461a5c9ec2d" name="Default" comment="">
5
5
<change beforePath="$PROJECT_DIR#x2F;.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR#x2F;.idea/workspace.xml" afterDir="false" />
6
6
<change beforePath="$PROJECT_DIR#x2F;lib/http_crawler/client.rb" beforeDir="false" afterPath="$PROJECT_DIR#x2F;lib/http_crawler/client.rb" afterDir="false" />
7
- <change beforePath="$PROJECT_DIR#x2F;lib/http_crawler/common/string.rb" beforeDir="false" afterPath="$PROJECT_DIR#x2F;lib/http_crawler/common/string.rb" afterDir="false" />
8
7
<change beforePath="$PROJECT_DIR#x2F;lib/http_crawler/version.rb" beforeDir="false" afterPath="$PROJECT_DIR#x2F;lib/http_crawler/version.rb" afterDir="false" />
9
8
</list>
10
9
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -25,7 +24,7 @@
25
24
</provider>
26
25
</entry>
27
26
</file>
28
- <file leaf-file-name="version.rb" pinned="false" current-in-tab="true">
27
+ <file leaf-file-name="version.rb" pinned="false" current-in-tab="false">
29
28
<entry file="file://$PROJECT_DIR#x2F;lib/http_crawler/version.rb">
30
29
<provider selected="true" editor-type-id="text-editor">
31
30
<state relative-caret-position="18">
@@ -38,7 +37,7 @@
38
37
<entry file="file://$PROJECT_DIR#x2F;lib/http_crawler/http/response.rb">
39
38
<provider selected="true" editor-type-id="text-editor">
40
39
<state relative-caret-position="2250">
41
- <caret line="125" column="6" lean-forward="true" selection-start-line="125" selection-start-column="6" selection-end-line="125" selection-end-column="26" />
40
+ <caret line="125" column="6" lean-forward="true" selection-start-line="125" selection-start-column="6" selection-end-line="125" selection-end-column="8" />
42
41
</state>
43
42
</provider>
44
43
</entry>
@@ -93,11 +92,11 @@
93
92
<provider selected="true" editor-type-id="text-editor" />
94
93
</entry>
95
94
</file>
96
- <file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
95
+ <file leaf-file-name="client.rb" pinned="false" current-in-tab="true">
97
96
<entry file="file://$PROJECT_DIR#x2F;lib/http_crawler/client.rb">
98
97
<provider selected="true" editor-type-id="text-editor">
99
- <state relative-caret-position="374">
100
- <caret line="278" column="75" lean-forward="true" selection-start-line="278" selection-start-column="75" selection-end-line="278" selection-end-column="75" />
98
+ <state relative-caret-position="273">
99
+ <caret line="212" column="11" lean-forward="true" selection-start-line="212" selection-start-column="11" selection-end-line="212" selection-end-column="11" />
101
100
</state>
102
101
</provider>
103
102
</entry>
@@ -152,8 +151,8 @@
152
151
<option value="$PROJECT_DIR#x2F;lib/http_crawler.rb" />
153
152
<option value="$PROJECT_DIR#x2F;lib/http_crawler/decryption.rb" />
154
153
<option value="$PROJECT_DIR#x2F;lib/http_crawler/common/string.rb" />
155
- <option value="$PROJECT_DIR#x2F;lib/http_crawler/client.rb" />
156
154
<option value="$PROJECT_DIR#x2F;lib/http_crawler/version.rb" />
155
+ <option value="$PROJECT_DIR#x2F;lib/http_crawler/client.rb" />
157
156
</list>
158
157
</option>
159
158
</component>
@@ -166,7 +165,7 @@
166
165
<component name="NodePackageJsonFileManager">
167
166
<packageJsonPaths />
168
167
</component>
169
- <component name="ProjectFrameBounds" fullScreen="true">
168
+ <component name="ProjectFrameBounds" extendedState="6" fullScreen="true">
170
169
<option name="y" value="23" />
171
170
<option name="width" value="1280" />
172
171
<option name="height" value="777" />
@@ -176,6 +175,7 @@
176
175
<foldersAlwaysOnTop value="true" />
177
176
</navigator>
178
177
<panes>
178
+ <pane id="Scope" />
179
179
<pane id="ProjectPane">
180
180
<subPane>
181
181
<expand>
@@ -198,7 +198,6 @@
198
198
<select />
199
199
</subPane>
200
200
</pane>
201
- <pane id="Scope" />
202
201
</panes>
203
202
</component>
204
203
<component name="PropertiesComponent">
@@ -254,27 +253,29 @@
254
253
<workItem from="1557137463254" duration="382000" />
255
254
<workItem from="1557156104186" duration="1815000" />
256
255
<workItem from="1557160216202" duration="138000" />
256
+ <workItem from="1563360666497" duration="431000" />
257
+ <workItem from="1563361538580" duration="6000" />
257
258
</task>
258
259
<servers />
259
260
</component>
260
261
<component name="TimeTrackingManager">
261
- <option name="totallyTimeSpent" value="33028000" />
262
+ <option name="totallyTimeSpent" value="33465000" />
262
263
</component>
263
264
<component name="ToolWindowManager">
264
- <frame x="0" y="0" width="1680" height="1050" extended-state="0" />
265
+ <frame x="0" y="0" width="1680" height="1050" extended-state="6" />
265
266
<layout>
266
- <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.16422467" />
267
+ <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.0964591" />
267
268
<window_info anchor="bottom" id="TODO" order="6" />
268
269
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
269
270
<window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
270
271
<window_info anchor="right" id="Database" order="3" />
271
272
<window_info anchor="bottom" id="Database Changes" order="7" show_stripe_button="false" />
272
- <window_info anchor="bottom" id="Run" order="2" />
273
273
<window_info anchor="bottom" id="Version Control" order="7" />
274
+ <window_info anchor="bottom" id="Run" order="2" />
274
275
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
275
- <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.22871795" />
276
+ <window_info anchor="bottom" id="Terminal" order="7" visible="true" weight="0.11794872" />
276
- <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
277
277
<window_info id="Favorites" order="2" side_tool="true" />
278
+ <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
278
279
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
279
280
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
280
281
<window_info anchor="right" id="Commander" order="0" weight="0.4" />
@@ -568,14 +569,7 @@
568
569
<entry file="file://$PROJECT_DIR#x2F;lib/http_crawler/http/response.rb">
569
570
<provider selected="true" editor-type-id="text-editor">
570
571
<state relative-caret-position="2250">
571
- <caret line="125" column="6" lean-forward="true" selection-start-line="125" selection-start-column="6" selection-end-line="125" selection-end-column="26" />
572
+ <caret line="125" column="6" lean-forward="true" selection-start-line="125" selection-start-column="6" selection-end-line="125" selection-end-column="8" />
572
- </state>
573
- </provider>
574
- </entry>
575
- <entry file="file://$PROJECT_DIR#x2F;lib/http_crawler/client.rb">
576
- <provider selected="true" editor-type-id="text-editor">
577
- <state relative-caret-position="374">
578
- <caret line="278" column="75" lean-forward="true" selection-start-line="278" selection-start-column="75" selection-end-line="278" selection-end-column="75" />
579
573
</state>
580
574
</provider>
581
575
</entry>
@@ -593,5 +587,12 @@
593
587
</state>
594
588
</provider>
595
589
</entry>
590
+ <entry file="file://$PROJECT_DIR#x2F;lib/http_crawler/client.rb">
591
+ <provider selected="true" editor-type-id="text-editor">
592
+ <state relative-caret-position="273">
593
+ <caret line="212" column="11" lean-forward="true" selection-start-line="212" selection-start-column="11" selection-end-line="212" selection-end-column="11" />
594
+ </state>
595
+ </provider>
596
+ </entry>
596
597
</component>
597
598
</project>
data/lib/http_crawler/client.rb CHANGED
@@ -9,8 +9,8 @@ module HttpCrawler
9
9
# web_name = "biquge_duquanben"
10
10
# 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例
11
11
#
12
- def for(web_name)
13
- "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
12
+ def for(web_name, args = {})
13
+ "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(args)
14
14
end
15
15
16
16
#
@@ -18,8 +18,8 @@ module HttpCrawler
18
18
# module_name = "HttpCrawler::Web::BiqugeDuquanben"
19
19
# 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例
20
20
#
21
- def for_module(module_name, *args)
22
- "#{module_name}::Client".constantize.new()
21
+ def for_module(module_name, args = {})
22
+ "#{module_name}::Client".constantize.new(args)
23
23
end
24
24
25
25
def for_uri(path)
@@ -54,6 +54,7 @@ module HttpCrawler
54
54
# 初始化一些 client 自定义参数
55
55
init_client
56
56
57
+ self.redirect = true
57
58
# 初始化 代理参数
58
59
@proxy_params = {key: "#{self.class.to_s.gsub(":", "_")}"}
59
60
end
@@ -61,7 +62,7 @@ module HttpCrawler
61
62
attr_accessor :max_error_num
62
63
# 最大错误重试次数
63
64
def max_error_num
64
- @max_error_num ||= 1
65
+ @max_error_num ||= 2
65
66
end
66
67
67
68
attr_reader :uri
@@ -109,6 +110,8 @@ module HttpCrawler
109
110
end
110
111
end
111
112
113
+ attr_accessor :redirect
114
+
112
115
attr_accessor :header
113
116
# 头文件相关方法
114
117
def header(parameter = {})
@@ -147,6 +150,11 @@ module HttpCrawler
147
150
148
151
def update_cookies(parameter = {})
149
152
parameter = parameter.symbolize_keys
153
+
154
+ @response.cookies.each do |cookie|
155
+ @cookies.add(cookie)
156
+ end unless @response.blank?
157
+
150
158
nil
151
159
end
152
160
@@ -206,6 +214,8 @@ module HttpCrawler
206
214
proxy_client = HttpCrawler::Proxy.for(proxy_api)
207
215
proxy_r = proxy_client.get_proxy(proxy_params.symbolize_keys)
208
216
proxy_ip = proxy_r.results unless proxy_r.results.blank?
217
+ # 测试本地代理
218
+ # proxy_ip = {p_addr: "127.0.0.1", p_port: 8888} if "production" =! Rails.env
209
219
if proxy_ip.blank?
210
220
Rails.logger.warn "无最新代理等待5秒后重新获取:proxy 为空"
211
221
else
@@ -242,10 +252,19 @@ module HttpCrawler
242
252
nil
243
253
end
244
254
245
- # 初始化http请求前置条件
246
- def http
255
+ # 创建时间: 2019/9/11 17:11
256
+ # 更新时间: 2019/9/11
257
+ # 作者: Jagger
258
+ # 方法名称: init_http
259
+ # 方法说明: 初始化http请求前置条件
260
+ # 调用方式: #init_http
261
+ #
262
+ # @return HTTP
263
+ #
264
+ def init_http
265
+ h = HTTP
247
266
# 自动重定向。最大重定向次数 max_hops: 5
248
- h = HTTP.follow(max_hops: 5)
267
+ h = h.follow(max_hops: 5) if self.redirect == true
249
268
250
269
# 添加代理
251
270
h = h.via(@proxy[:p_addr], @proxy[:p_port].to_i, @proxy[:p_user], @proxy[:p_pass]) unless (@proxy.blank?)
@@ -268,6 +287,11 @@ module HttpCrawler
268
287
h
269
288
end
270
289
290
+ # 初始化http请求前置条件
291
+ def http
292
+ init_http
293
+ end
294
+
271
295
272
296
# 发送 get 请求
273
297
def get(path, params = {}, limit = 3)
@@ -289,7 +313,7 @@ module HttpCrawler
289
313
end
290
314
291
315
# 发送 post 请求
292
- def post(path, params = {},format = :form)
316
+ def post(path, params = {}, format = :form)
293
317
raise "Client uri为空" unless self.uri
294
318
request {http.post((self.uri + path).to_s, format => params, :ssl_context => @ctx)}
295
319
end
@@ -320,29 +344,22 @@ module HttpCrawler
320
344
n = max_error_num
321
345
begin
322
346
r = block.call
323
- if r.status.success?
347
+ if r.status.success? || (redirect == false && r.status.redirect?)
324
348
return r
325
349
else
326
350
raise "请求失败(#{r.code}):#{r.uri.to_s}"
327
351
end
328
352
rescue => error
329
353
Rails.logger.debug error.class
330
- case error
331
- when HTTP::TimeoutError
332
- # 超时错误切换代理
333
- if self.update_proxy?
354
+ # 错误尝试次数
355
+ if n <= 0
356
+ # 错误尝试次数小于等于0就结束尝试
357
+ raise error
334
- retry
335
- else
336
- raise error
337
- end
338
358
else
339
- # 错误尝试次数
340
- if n <= 0
341
- raise error
342
- else
359
+ # 每次错误次数尝试 -1
360
+ n -= 1
361
+ # self.update_proxy?
362
+ retry
343
- n -= 1
344
- retry
345
- end
346
363
end
347
364
end
348
365
end # def request(&block)
data/lib/http_crawler/http/response.rb CHANGED
@@ -8,9 +8,7 @@ module HTTP
8
8
# 数据解压
9
9
case self.headers['Content-Encoding']
10
10
when 'gzip' then
11
- sio = StringIO.new(self.body.to_s)
11
+ @decoding_body = Zlib::GzipReader.new(StringIO.new(self.body.to_s), encoding: "ASCII-8BIT").read
12
- gz = Zlib::GzipReader.new(sio)
13
- @decoding_body = gz.read()
14
12
when 'br'
15
13
@decoding_body = Brotli.inflate(self.body.to_s)
16
14
# when 'deflate'
@@ -35,12 +33,12 @@ module HTTP
35
33
36
34
# 进行转码
37
35
begin
38
- @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
36
+ @decoding_body.force_encoding(encoding).encode!('utf-8',invalid: :replace) if encoding && encoding != @decoding_body.encoding
39
37
rescue => e
40
38
# 转码错误后再次使用 CharDet 判断编码格式后进行转码
41
39
cd = CharDet.detect(@decoding_body)["encoding"]
42
40
if (cd && cd != encoding)
43
- @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
41
+ @decoding_body.force_encoding(cd).encode!('utf-8',invalid: :replace) if encoding != @decoding_body.encoding
44
42
else
45
43
# 还是转码错误则抛出源码转字符串内容
46
44
self.body.to_s
data/lib/http_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
module HttpCrawler
2
- VERSION = "0.3.1.25"
2
+ VERSION = "0.3.1.30"
3
3
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
--- !ruby/object:Gem::Specification
2
2
name: http_crawler
3
3
version: !ruby/object:Gem::Version
4
- version: 0.3.1.25
4
+ version: 0.3.1.30
5
5
platform: ruby
6
6
authors:
7
7
- jagger
8
8
autorequire:
9
9
bindir: exe
10
10
cert_chain: []
11
- date: 2019-05-18 00:00:00.000000000 Z
11
+ date: 2019-09-11 00:00:00.000000000 Z
12
12
dependencies:
13
13
- !ruby/object:Gem::Dependency
14
14
name: rspec