现在的反爬措施越来越复杂,一些常见的反爬措施有Headers and referer反爬机制、语音动作识别、各种变态二维码(12306)、字符js加密、Ajax动态加载、模拟登录和cookie限制。。。

目标网站:微信公众平台
反爬机制:用户名不变,密码加密
相关技术:selenium,requests,execjs
技术难度:★★☆☆☆☆
爬取目标:模拟登录公众号,获取页面的所有信息
源代码:本页面(selenium代码还在测试更新当中)
说明:第一次玩js逆向爬虫,如有不足,多多指教

1.分析js,找到目标接口函数

F12打开 开发者工具, 按照一般逻辑步骤,先输入一个错误账号密码,找到真正的入口接口函数,案例如下:

继续向下翻动,我们可以看到请求参数表单数据的pwd密码是明显经过加密的

2.破解

在Search功能栏下搜索pwd 很快可以定位到以下代码

1
2
3
4
5
6
7
8
9
10
_loginPost: function(e, i) {
n.post({
url: i.url,
data: {
username: i.account,
pwd: o(i.pwd.substr(0, 16)), // 关键代码
imgcode: i.verify,
f: "json",
userlang: i.currentLang,
redire

我们在pwd: o(i.pwd.substr(0, 16))行处下断点。然后步入进去.

1
2
3
4
5
6
7
// Snippet quoted from the site's bundled script. It dispatches on r and t:
//   r truthy, t truthy  -> c(r, n)
//   r truthy, t falsy   -> e(c(r, n))
//   r falsy,  t truthy  -> o(n)
//   r falsy,  t falsy   -> e(o(n))
t.exports = function(n, r, t) {
return r ? t ? c(r, n) : function(n, r) {
return e(c(n, r))
}(r, n) : t ? o(n) : function(n) {
return e(o(n)) // ***** branch taken when only n is supplied (r and t are both falsy)
}(n)
}

继续步入一次,代码最后会走到我标****的地方。这就是我们需要的关键函数,扣代码吧。

扣下代码。然后自己编写一个Js function

1
2
3
// Wrapper around the extracted routines: run o() on the password,
// then pass its result through e().
function JsKiller(pwd) {
  var rawDigest = o(pwd);
  return e(rawDigest);
}

js逆向加密源代码:pwd.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/**
 * Adds two integers with 32-bit wraparound by summing the low and high
 * 16-bit halves separately and recombining them.
 *
 * @param {number} n First addend.
 * @param {number} r Second addend.
 * @returns {number} The sum as a signed 32-bit integer.
 */
function d(n, r) {
  var lowSum = (n & 65535) + (r & 65535);
  // Carry out of the low half is folded into the high half.
  var highSum = (n >> 16) + (r >> 16) + (lowSum >> 16);
  return (highSum << 16) | (lowSum & 65535);
}

/**
 * Common step shared by the four round helpers (g, l, v, s):
 * sums r, n, e and o with 32-bit wraparound, rotates the sum left by
 * u bits, then adds t (again with wraparound).
 *
 * @param {number} n Round-function value supplied by the caller.
 * @param {number} r Accumulator word being updated.
 * @param {number} t Word added after the rotation.
 * @param {number} e Message word.
 * @param {number} u Left-rotation amount in bits.
 * @param {number} o Additive constant.
 * @returns {number} The updated 32-bit word.
 */
function f(n, r, t, e, u, o) {
  var mixed = d(d(r, n), d(e, o));
  var rotated = (mixed << u) | (mixed >>> (32 - u));
  return d(rotated, t);
}

/**
 * Round helper that feeds the bitwise selection (r & t) | (~r & e)
 * into the shared step f.
 *
 * @returns {number} Result of f for this step.
 */
function g(n, r, t, e, u, o, c) {
  var selected = (r & t) | (~r & e);
  return f(selected, n, r, u, o, c);
}

/**
 * Round helper that feeds the bitwise selection (r & e) | (t & ~e)
 * into the shared step f.
 *
 * @returns {number} Result of f for this step.
 */
function l(n, r, t, e, u, o, c) {
  var selected = (r & e) | (t & ~e);
  return f(selected, n, r, u, o, c);
}

/**
 * Round helper that feeds the three-way XOR r ^ t ^ e into the
 * shared step f.
 *
 * @returns {number} Result of f for this step.
 */
function v(n, r, t, e, u, o, c) {
  var parity = r ^ t ^ e;
  return f(parity, n, r, u, o, c);
}

/**
 * Round helper that feeds t ^ (r | ~e) into the shared step f.
 *
 * @returns {number} Result of f for this step.
 */
function s(n, r, t, e, u, o, c) {
  var combined = t ^ (r | ~e);
  return f(combined, n, r, u, o, c);
}

/**
 * Core digest routine: processes an array of 32-bit words in 16-word
 * blocks and returns the four 32-bit state words. The initial state and
 * per-step constants match MD5 (RFC 1321).
 *
 * Fix: the loop body is now braced so that all four state words are
 * folded back into the running state after EVERY block. Previously only
 * `f = d(f, e)` was inside the loop, which produced wrong digests for
 * any input spanning more than one 16-word block.
 *
 * @param {number[]} n Message words (mutated in place: padding is appended).
 * @param {number} r Message length in bits.
 * @returns {number[]} The state words [f, i, a, h].
 */
function i(n, r) {
  // Append the single 0x80 padding byte right after the message bits.
  n[r >> 5] |= 128 << r % 32;
  // Store the bit length in word 14 of the final 16-word block.
  n[14 + (r + 64 >>> 9 << 4)] = r;
  var t, e, u, o, c, f = 1732584193, i = -271733879, a = -1732584194, h = 271733878;
  for (t = 0; t < n.length; t += 16) {
    // The innermost call also snapshots the incoming state into e, u, o, c
    // (via `e = f`, `u = i`, `o = a`, `c = h`) before the 64 steps run.
    i = s(i = s(i = s(i = s(i = v(i = v(i = v(i = v(i = l(i = l(i = l(i = l(i = g(i = g(i = g(i = g(u = i, a = g(o = a, h = g(c = h, f = g(e = f, i, a, h, n[t], 7, -680876936), i, a, n[t + 1], 12, -389564586), f, i, n[t + 2], 17, 606105819), h, f, n[t + 3], 22, -1044525330),
      a = g(a, h = g(h, f = g(f, i, a, h, n[t + 4], 7, -176418897), i, a, n[t + 5], 12, 1200080426), f, i, n[t + 6], 17, -1473231341), h, f, n[t + 7], 22, -45705983),
      a = g(a, h = g(h, f = g(f, i, a, h, n[t + 8], 7, 1770035416), i, a, n[t + 9], 12, -1958414417), f, i, n[t + 10], 17, -42063), h, f, n[t + 11], 22, -1990404162),
      a = g(a, h = g(h, f = g(f, i, a, h, n[t + 12], 7, 1804603682), i, a, n[t + 13], 12, -40341101), f, i, n[t + 14], 17, -1502002290), h, f, n[t + 15], 22, 1236535329),
      a = l(a, h = l(h, f = l(f, i, a, h, n[t + 1], 5, -165796510), i, a, n[t + 6], 9, -1069501632), f, i, n[t + 11], 14, 643717713), h, f, n[t], 20, -373897302),
      a = l(a, h = l(h, f = l(f, i, a, h, n[t + 5], 5, -701558691), i, a, n[t + 10], 9, 38016083), f, i, n[t + 15], 14, -660478335), h, f, n[t + 4], 20, -405537848),
      a = l(a, h = l(h, f = l(f, i, a, h, n[t + 9], 5, 568446438), i, a, n[t + 14], 9, -1019803690), f, i, n[t + 3], 14, -187363961), h, f, n[t + 8], 20, 1163531501),
      a = l(a, h = l(h, f = l(f, i, a, h, n[t + 13], 5, -1444681467), i, a, n[t + 2], 9, -51403784), f, i, n[t + 7], 14, 1735328473), h, f, n[t + 12], 20, -1926607734),
      a = v(a, h = v(h, f = v(f, i, a, h, n[t + 5], 4, -378558), i, a, n[t + 8], 11, -2022574463), f, i, n[t + 11], 16, 1839030562), h, f, n[t + 14], 23, -35309556),
      a = v(a, h = v(h, f = v(f, i, a, h, n[t + 1], 4, -1530992060), i, a, n[t + 4], 11, 1272893353), f, i, n[t + 7], 16, -155497632), h, f, n[t + 10], 23, -1094730640),
      a = v(a, h = v(h, f = v(f, i, a, h, n[t + 13], 4, 681279174), i, a, n[t], 11, -358537222), f, i, n[t + 3], 16, -722521979), h, f, n[t + 6], 23, 76029189),
      a = v(a, h = v(h, f = v(f, i, a, h, n[t + 9], 4, -640364487), i, a, n[t + 12], 11, -421815835), f, i, n[t + 15], 16, 530742520), h, f, n[t + 2], 23, -995338651),
      a = s(a, h = s(h, f = s(f, i, a, h, n[t], 6, -198630844), i, a, n[t + 7], 10, 1126891415), f, i, n[t + 14], 15, -1416354905), h, f, n[t + 5], 21, -57434055),
      a = s(a, h = s(h, f = s(f, i, a, h, n[t + 12], 6, 1700485571), i, a, n[t + 3], 10, -1894986606), f, i, n[t + 10], 15, -1051523), h, f, n[t + 1], 21, -2054922799),
      a = s(a, h = s(h, f = s(f, i, a, h, n[t + 8], 6, 1873313359), i, a, n[t + 15], 10, -30611744), f, i, n[t + 6], 15, -1560198380), h, f, n[t + 13], 21, 1309151649),
      a = s(a, h = s(h, f = s(f, i, a, h, n[t + 4], 6, -145523070), i, a, n[t + 11], 10, -1120210379), f, i, n[t + 2], 15, 718787259), h, f, n[t + 9], 21, -343485551);
    // Fold this block's result into the state saved at the start of the block.
    f = d(f, e);
    i = d(i, u);
    a = d(a, o);
    h = d(h, c);
  }
  return [f, i, a, h];
}

/**
 * Unpacks an array of 32-bit words into a string, emitting one
 * character per byte with the least-significant byte of each word first.
 *
 * @param {number[]} n Words to unpack.
 * @returns {string} String of length 4 * n.length with char codes 0-255.
 */
function a(n) {
  var pieces = [];
  var totalBits = 32 * n.length;
  var bit;
  for (bit = 0; bit < totalBits; bit += 8) {
    pieces.push(String.fromCharCode((n[bit >> 5] >>> (bit % 32)) & 255));
  }
  return pieces.join("");
}

/**
 * Packs a string into an array of 32-bit words, four characters per
 * word, with the first character in the least-significant byte. Only the
 * low 8 bits of each char code are used.
 *
 * @param {string} n String to pack.
 * @returns {number[]} The packed words.
 */
function h(n) {
  var words = [];
  var bitCount = 8 * n.length;
  var k;
  // Pre-size the array to the number of whole words, then zero-fill it.
  words[(n.length >> 2) - 1] = void 0;
  for (k = 0; k < words.length; k += 1) {
    words[k] = 0;
  }
  for (k = 0; k < bitCount; k += 8) {
    words[k >> 5] |= (255 & n.charCodeAt(k / 8)) << k % 32;
  }
  return words;
}

/**
 * Hex-encodes a string: every character becomes two lowercase hex digits
 * taken from the low 8 bits of its char code.
 *
 * Fix: the append is now inside the (braced) loop. Previously only the
 * `charCodeAt` assignment was in the loop body, so just the LAST character
 * was encoded and the empty string produced "00".
 *
 * @param {string} n String to encode.
 * @returns {string} Hex string of length 2 * n.length.
 */
function e(n) {
  var hexDigits = "0123456789abcdef";
  var out = "";
  var code, idx;
  for (idx = 0; idx < n.length; idx += 1) {
    code = n.charCodeAt(idx);
    out += hexDigits.charAt(code >>> 4 & 15) + hexDigits.charAt(15 & code);
  }
  return out;
}

/**
 * Converts a string to its UTF-8 byte sequence, represented as a string
 * with one character (code 0-255) per byte.
 *
 * @param {string} n Input text.
 * @returns {string} UTF-8 bytes as a binary string.
 */
function u(n) {
  // Percent-encode as UTF-8 first, then collapse each %XX back to one char.
  var percentEncoded = encodeURIComponent(n);
  return unescape(percentEncoded);
}

/**
 * Computes the raw digest of a string: UTF-8 encodes it with u(), packs
 * the bytes into words with h(), runs i() over them, and unpacks the
 * resulting words with a().
 *
 * @param {string} n Input text.
 * @returns {string} Raw digest as a binary string.
 */
function o(n) {
  var bytes = u(n);
  var words = h(bytes);
  var digestWords = i(words, 8 * bytes.length);
  return a(digestWords);
}

/**
 * Keyed digest built from i(): hashes (key XOR 0x36.. || message), then
 * hashes (key XOR 0x5c.. || inner digest) — the HMAC construction
 * (RFC 2104) over the digest implemented by i().
 *
 * Fixes:
 *  - both pads are now filled inside the loop (previously the outer pad
 *    assignment ran once, after the loop, at index 16);
 *  - the outer hash is actually computed and returned (previously the
 *    function returned the inner digest words and the final line was
 *    unreachable).
 *
 * @param {string} n Key.
 * @param {string} r Message.
 * @returns {string} Raw keyed digest as a binary string.
 */
function c(n, r) {
  var key = u(n);
  var message = u(r);
  var keyWords = h(key);
  var innerPad = [];
  var outerPad = [];
  var idx, innerDigest;
  // Pre-size both pads to 16 words (one 512-bit block).
  innerPad[15] = outerPad[15] = void 0;
  // Keys longer than one block are replaced by their digest.
  if (keyWords.length > 16) {
    keyWords = i(keyWords, 8 * key.length);
  }
  for (idx = 0; idx < 16; idx += 1) {
    innerPad[idx] = 909522486 ^ keyWords[idx];   // 0x36363636
    outerPad[idx] = 1549556828 ^ keyWords[idx];  // 0x5c5c5c5c
  }
  innerDigest = i(innerPad.concat(h(message)), 512 + 8 * message.length);
  // 640 bits = 512-bit pad block + 128-bit inner digest.
  return a(i(outerPad.concat(innerDigest), 640));
}

/**
 * Entry point called from Python: runs o() on the password and passes
 * the result through e().
 *
 * @param {string} pwd Plain-text password.
 * @returns {string} The value of e(o(pwd)).
 */
function JsKiller(pwd) {
  var rawDigest = o(pwd);
  return e(rawDigest);
}

3.执行破解出的js

1
2
3
4
5
6
7
8
9
10
11
12
import execjs


def parseJS(pwd, js_path="pwd.static"):
    """Run the ``JsKiller`` function from the extracted JavaScript file on ``pwd``.

    Args:
        pwd: Plain-text password handed to ``JsKiller``.
        js_path: Path of the JavaScript source to load. Defaults to the
            previously hard-coded "pwd.static".
            NOTE(review): the accompanying text calls this file pwd.js --
            confirm the actual file name on disk.

    Returns:
        Whatever ``JsKiller(pwd)`` returns in the JavaScript runtime.
    """
    # Load the JavaScript source
    with open(js_path, 'r') as f:
        js_code = f.read()
    # Compile the JavaScript so its functions can be called
    ctx = execjs.compile(js_code)
    # Call JsKiller with pwd as its only argument
    return ctx.call('JsKiller', pwd)

比如我们来执行一下这个 Python 代码,测试一下结果:令 pwd = "Github"
ctx.call('JsKiller', pwd) ==> "e1adbcbb92c622d0b3e619f9d0730abf",即将 Github 加密成 32 位的十六进制字符串。

4.模拟登陆

这里直接列出来模拟登录的源代码,这里需要注意一定要加referer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import execjs
import requests

url = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=startlogin" # the real login entry-point URL


def get_content(username, pwd):
    """POST the login form to the module-level ``url`` and return the response body.

    Args:
        username: Account name sent as the ``username`` form field.
        pwd: Password value sent as the ``pwd`` form field (the caller in
            this script passes the output of ``parseJS``).

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or a request failure, which is printed).
    """
    headers = {
        "referer": "https://mp.weixin.qq.com/",  # required
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    }
    data = {
        "username": username,
        "pwd": pwd,
        "f": "json",
        "imgcode": "",
        "userlang": "zh_CN",
        "lang": "zh_CN",
        "ajax": 1
    }
    try:
        response = requests.post(url=url, headers=headers, data=data, timeout=10)
    except requests.RequestException as e:
        # Only network/HTTP-layer failures are treated as "no content";
        # programming errors are no longer silently swallowed.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parseJS(pwd, js_path="pwd.static"):
    """Run the ``JsKiller`` function from the extracted JavaScript file on ``pwd``.

    Args:
        pwd: Plain-text password handed to ``JsKiller``.
        js_path: Path of the JavaScript source to load. Defaults to the
            previously hard-coded "pwd.static".

    Returns:
        Whatever ``JsKiller(pwd)`` returns in the JavaScript runtime.
    """
    with open(js_path, 'r') as f:
        js_code = f.read()

    ctx = execjs.compile(js_code)
    return ctx.call('JsKiller', pwd)


if __name__ == '__main__':
    # Demo run: pass the placeholder password through parseJS, then post the login form.
    username = "980710425@qq.com"
    pwd = parseJS("XXXXXXXX")
    print(get_content(username, pwd))

5.结果分析

得到返回结果:这里返回了一个json格式数据,并且我们可以得知,这里其实是重定向到了这个新的URL地址

1
2
3
4
5
{"base_resp":{
"err_msg":"ok",
"ret":0},
"redirect_url":"/cgi-bin/bizlogin?action=validate&lang=zh_CN&account=980710425%40qq.com"
}

https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN&account=980710425%40qq.com
这个链接其实是需要微信扫码登陆认证的链接,我也不知道怎么去绕过这个扫码认证,如果有大神巨佬知道,请带带我。我的笨办法是通过selenium自动化测试,还是不可避免要扫码,然后扫码登陆,获取目标数据

未完待续。。。


 评论

联系我 | Contact with me

Copyright © 2019-2020 谁知你知我,我知你知深。此恨经年深,比情度日久

博客内容遵循 署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0) 协议