首页
文章分类
逆向网安
中英演讲
杂类教程
学习笔记
前端开发
汇编
数据库
.NET
服务器
Python
Java
PHP
Git
算法
安卓开发
生活记录
读书笔记
作品发布
人体健康
网上邻居
留言板
欣赏小姐姐
关于我
Search
登录
1
利用AList搭建家庭个人影音库
4,700 阅读
2
浅尝Restful Fast Request插件,一句话完成 逆向过程
4,070 阅读
3
完美破解The Economist付费墙
2,819 阅读
4
i茅台app接口自动化csharp wpf实现,挂机windows服务器每日自动预约
2,645 阅读
5
青龙面板基本使用并添加修改微信/支付宝步数脚本
2,096 阅读
Search
标签搜索
PHP
Laravel
前端
csharp
安卓逆向
JavaScript
Python
Java
爬虫
抓包
Git
winform
android
Fiddler
Vue
selenium
LeetCode
每日一题
简单题
docker
Hygge
累计撰写
98
篇文章
累计收到
446
条评论
首页
栏目
逆向网安
中英演讲
杂类教程
学习笔记
前端开发
汇编
数据库
.NET
服务器
Python
Java
PHP
Git
算法
安卓开发
生活记录
读书笔记
作品发布
人体健康
页面
网上邻居
留言板
欣赏小姐姐
关于我
用户登录
搜索到
1
篇与
的结果
2024-04-26
蚂蚁智学题库爬虫并整理到Excel
接的小私活,目标爬取下来题库并整理到Excel里。目标站点:https://www.mayizhixue.cn/{cloud title="蚂蚁智学" type="bd" url="https://pan.baidu.com/s/1pjg1vTojaazfebpCT1J9SQ?pwd=d2nv" password="d2nv"/}import requests from openpyxl import Workbook from openpyxl.utils import get_column_letter from openpyxl import load_workbook import os common_headers = { 'Authorization': 'TOKEN', } record_id = 1 wb = load_workbook(filename='sample.xlsx') ws = wb.active rows = ws.rows def get_target_row_number(): rows = ws.rows idx = 1 for row in rows: # for cell in row: # print(cell.value, end=' ') # print() if row[0].value is None: return idx idx = idx + 1 return idx def write_row(row, kIndex=None): global record_id rowNumber = get_target_row_number() # print(f"当前行数:{rowNumber}") if kIndex is None: ws.cell(row=rowNumber, column=1).value = record_id else: ws.cell(row=rowNumber, column=1).value = str(record_id) + '.' + str(kIndex) ws.cell(row=rowNumber, column=2).value = row.get('title', '') ws.cell(row=rowNumber, column=3).value = row.get('type', '') # ws.cell(row=rowNumber, column=4).value = row.get('type') # 分数 # ws.cell(row=rowNumber, column=5).value = row.get('type') # 难度 option_idx = 0 for option in row.get('options', []): if 6 + option_idx >= 11: break ws.cell(row=rowNumber, column=6 + option_idx).value = option option_idx = option_idx + 1 ws.cell(row=rowNumber, column=11).value = row.get('answer', '') ws.cell(row=rowNumber, column=12).value = row.get('analysis', '') if kIndex is None: record_id = record_id + 1 def map_type_kv(key): # 1-单选题 2-多选题 6-共享题干题 type = '单选题' if key in ('1', 1): type = '单选题' elif key in ('2', 2): type = '多选题' elif key in ('3', 3): type = '不定项选择题' elif key in ('4', 4): type = '判断题' elif key in ('6', 6): type = '材料题' else: print('不支持的类型:%s' % key) exit() return type def get_test_question(sectionId): params = { 'sectionId': sectionId, 'type': '2', } response = requests.get('https://wx.yiwenjy.cn/yunlian_pc/querySubjectList', params=params, headers=common_headers).json() data = response.get('data') handle_data_2_excel(data) def handle_data_2_excel(data): for i in data: # 此时 i 为对象,取出所有key并遍历 for key in i.keys(): type = map_type_kv(key) # 开始遍历这一题型的所有题目 if type in ['单选题', '多选题', '不定项选择题', '判断题']: for j in i.get(key): row = { 'title': j.get('issue'), 'type': type, 'options': [], 'answer': j.get('answer'), 'analysis': j.get('analysis') } options = j.get('sOption') # 使用|分割选项 options = options.split('|') for k in options: # j为A.选项内容 所以取第三个字符开始 row['options'].append(k[2:]) write_row(row) elif type == '材料题': for j in i.get(key): row = { 'title': j.get('stem'), 'type': type } write_row(row) # 开始爬下面的point kIndex = 1 for k in j.get('childre', []): subtype = map_type_kv(k.get('subType')) row = { 'title': k.get('issue'), 'type': subtype, 'options': [], 'answer': k.get('answer'), 'analysis': k.get('analysis') } options = k.get('sOption') # 使用|分割选项 options = options.split('|') for opt in options: # j为A.选项内容 所以取第三个字符开始 row['options'].append(opt[2:]) write_row(row, kIndex) kIndex = kIndex + 1 def get_exam_question(paperId): response = requests.get( 'https://wx.yiwenjy.cn/yunlian_pc/queryoPaperSubjectList', params={ 'paperId': paperId, 'mode': '2' }, headers=common_headers).json() data = response.get('data') handle_data_2_excel(data) def get_catalogue(courseName, courseId): response = requests.get('https://wx.yiwenjy.cn/yunlian_pc/querySectionList', params={ 'courseId': courseId }, headers=common_headers).json() data = response.get('data') for i in data: print(f"当前章节ID:{i.get('id')},章节名称:{i.get('sectionName')}") # 创建相关文件夹 if not os.path.exists(courseName + '/' + i.get('sectionName')): os.makedirs(courseName + '/' + i.get('sectionName')) # 这里需要一直向下判断是否有子节点 copy_i = i # dfs算法 access_next_level(courseName + "/", copy_i) def access_next_level(path, item): global wb, ws, rows, record_id # dfs算法 开始不断找下级 向上返回 if item.get('children') is not None: path = path + item.get('sectionName') + '/' for i in item.get('children'): access_next_level(path, i) else: print(f"当前小节ID:{item.get('id')},小节名称:{item.get('sectionName')}") record_id = 1 wb = load_workbook(filename='sample.xlsx') ws = wb.active rows = ws.rows get_test_question(item.get('id')) # 判断目录是否存在 if not os.path.exists(path): os.makedirs(path) wb.save(f'{path}/{item.get("sectionName")}.xlsx') def get_product_course_info(id): response = requests.get('https://wx.yiwenjy.cn/yunlian_pc/queryProductCourse', params={ 'id': id }, headers=common_headers).json() data = response.get('data') """ 每个ITEM courseName:"中国建设银行VIP" examId:"43d8625d21614cab9f6a2e323e0cd4db" id:"1686999432228376576" """ return data def query_paper_type_list(id): response = requests.get('https://wx.yiwenjy.cn/yunlian_pc/queryPaperTypeList', params={ 'courseId': id }, headers=common_headers).json() """ "id": "1", "paperTypeName": "章节练习", "icon": null, "version": null, "isSection": null, "hasSection": null """ return response.get('data') def get_li_nian_zhen_ti_list(id, paperTypeId): response = requests.get('https://wx.yiwenjy.cn/yunlian_pc/queryPaperList', params={ 'courseId': id, 'paperTypeId': paperTypeId }, headers=common_headers).json() """ 每个ITEM "id": "1703299201187844096", "paperName": "2022年银行招聘笔试《中国建设银行》试题", "onlineTime": "2023-09-17 00:00:00", "referenNumber": 139, "tryBuy": 1, "hasMake": 3, "mode": null """ return response.get('data') course = [ {'id': 'f753f9934c60427fadfba664229a8487', 'name': '2024年军队文职人员招聘《公共科目》题库'} ] for courseItem in course: # 创建科目的文件夹 if not os.path.exists(courseItem.get('name')): os.makedirs(courseItem.get('name')) product_course_info = get_product_course_info(courseItem.get('id')) for product_course in product_course_info: # 查询当前科目下的试卷类型列表 paper_type_list = query_paper_type_list(product_course.get('id')) for paper_type in paper_type_list: print(f"当前科目:{product_course.get('courseName')},当前试卷类型:{paper_type.get('paperTypeName')}") if paper_type.get('paperTypeName') == '章节练习': get_catalogue(courseItem.get("name"), product_course.get('id')) elif paper_type.get('paperTypeName') in ('历年真题', '考前点题', '模拟试卷', '预测试卷', '考前点题'): li_nian_zhen_ti_list = get_li_nian_zhen_ti_list(product_course.get('id'), paper_type.get('id')) for li_nian_zhen_ti in li_nian_zhen_ti_list: record_id = 1 wb = load_workbook(filename='sample.xlsx') ws = wb.active rows = ws.rows print(f"当前试卷ID:{li_nian_zhen_ti.get('id')},试卷名称:{li_nian_zhen_ti.get('paperName')}") get_exam_question(li_nian_zhen_ti.get('id')) wb.save( f'{courseItem.get("name")}/{courseItem.get("name")}-{li_nian_zhen_ti.get("paperName")}.xlsx') else: print("不支持的试卷类型:%s" % paper_type.get('paperTypeName')) exit()
2024年04月26日
142 阅读
0 评论
0 点赞