// ==UserScript==
// @name         Wenku Doc Downloader
// @namespace    http://tampermonkey.net/
// @version      0.5
// @description  下载“百度文库”“豆丁网”文档,仅支持导出为txt文档或图片型的pdf。
// @author       allenlv2690@gmail.com
// @match        https://wenku.baidu.com/view/*
// @match        https://www.docin.com/p-*
// @icon         https://www.google.com/s2/favicons?domain=limestart.cn
// @grant        none
// @license      GPL-3.0-only
// @create       2021-11-22
// @note         修复了“pdf元素数量>2,doc元素数量=1”时不能下载文档的bug
// @note         特此感谢“chmdir”的反馈
// @note         【图片下载合并器】没有更新
// @downloadURL none
// ==/UserScript==

/*
 * Helper functions
 */

// How long to keep a download's object URL alive after the click that starts
// the download. Revoking right away could invalidate the URL while the browser
// is still reading it.
const OBJECT_URL_REVOKE_DELAY_MS = 10000;

/**
 * Wrap `content` in a Blob and trigger a browser download of it.
 *
 * @param {string} fileName - Suggested name for the downloaded file.
 * @param {*} content - Anything accepted as a Blob part (a string here).
 */
function createAndDownloadFile(fileName, content) {
    const blob = new Blob([content]);
    const href = URL.createObjectURL(blob);
    const aTag = document.createElement("a");
    aTag.download = fileName;
    aTag.href = href;
    aTag.click();
    // revokeObjectURL takes the URL string returned by createObjectURL, not
    // the Blob itself; passing the Blob releases nothing.
    setTimeout(() => URL.revokeObjectURL(href), OBJECT_URL_REVOKE_DELAY_MS);
}

/**
 * Replace every Unicode space separator (no-break space, en/em space,
 * ideographic space, ...) with a plain ASCII space.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeSpaces(text) {
    return text.replace(/\p{Zs}/gu, " ");
}

/**
 * Clean up text extracted from a plain-text document.
 *
 * Only space normalization is performed. An earlier placeholder-based pass
 * has been dropped: its pattern was malformed and never matched ordinary text,
 * while its cleanup step deleted the first literal "TEMP" in the document.
 * NOTE(review): that pass looks like it was meant to remove spaces between
 * CJK characters; confirm whether that compaction is wanted before adding it.
 *
 * @param {string} text
 * @returns {string}
 */
function formatText(text) {
    return normalizeSpaces(text);
}

/**
 * Clean up text extracted from a mixed text-and-picture document.
 *
 * Runs of spaces become line breaks, spaces around line breaks are trimmed,
 * blank lines are collapsed, and the "精选文档" watermark is removed.
 *
 * @param {string} text
 * @returns {string}
 */
function formatText2(text) {
    return text
        // A run of two or more spaces is treated as a line break.
        .replace(/\p{Zs}{2,}/gu, "\n")
        // Drop a single space left in front of a line break.
        .replace(/\p{Zs}\n/gu, "\n")
        // Collapse blank (or space-only) lines.
        .replace(/\n\p{Zs}*\n*\n/gu, "\n")
        // Drop indentation that follows a line break.
        .replace(/\p{Zs}*\n\p{Zs}+/gu, "\n")
        // Whatever space variants remain become plain spaces.
        .replace(/\p{Zs}/gu, " ")
        // Remove the watermark; the replacement must be given explicitly,
        // otherwise the text "undefined" is written in its place.
        .replace(/[ \n]精选文档[ \n]/g, "\n")
        .replace(/\n{2,}/g, "\n");
}

/**
 * Collect the text content of every "reader-word-layer" element, in document
 * order.
 *
 * @returns {string[]}
 */
function collectWordLayerTexts() {
    const texts = [];
    for (const elem of document.getElementsByClassName("reader-word-layer")) {
        texts.push(elem.textContent);
    }
    return texts;
}

/*
 * Main feature functions
 */

/**
 * Click handler of the first (blue) button: expand the document.
 *
 * Each invocation clicks the "read-all" (continue reading) button once if it
 * exists. When no such button is found, the document is reported as fully
 * expanded and the export button is shown in place of the expand button, so
 * the user presses the expand button repeatedly until that happens.
 */
function readAll() {
    const readAllBtn = document.getElementsByClassName("read-all")[0];
    if (readAllBtn) {
        readAllBtn.click();
        return;
    }

    alert("文档已经完全展开,可以导出");
    const initBtn = document.getElementsByClassName("init-btn")[0];
    const saveDocBtn = document.getElementsByClassName("save-doc-btn")[0];
    initBtn.style.display = "none";
    saveDocBtn.style.removeProperty("display");
}

/**
 * Export a picture-based (pdf-type) document as a list of image URLs.
 *
 * The URL of each "reader-pic-item" element is cut out of its inline `style`
 * attribute (the text between `url(` and `); background-position`). Elements
 * whose style does not contain such a URL are skipped. The URLs are written
 * one per line to "urls.csv".
 */
function savePDFData() {
    alert("Function savePDFData was called.");
    const prefix = /: ?url[(]/;
    const suffix = /[)]; ?background-position/;
    const urls = [];
    for (const item of document.getElementsByClassName("reader-pic-item")) {
        const style = item.getAttribute("style") || "";
        const afterPrefix = style.split(prefix)[1];
        if (afterPrefix === undefined) {
            continue;
        }
        // Strip the quotes CSS may put around the URL, on every entry.
        urls.push(afterPrefix.split(suffix)[0].replace(/"/g, ""));
    }
    createAndDownloadFile("urls.csv", urls.join("\n"));
}

/**
 * Export a text-only (doc-type) document as a plain-text file.
 */
function saveDocData() {
    alert("Function saveDocData was called.");
    const content = formatText(collectWordLayerTexts().join(""));
    createAndDownloadFile("纯文本文档.txt", content);
}

/**
 * Export a slide (ppt-type) document as a list of image URLs, one per line,
 * taken from the `src` of the first child of each "ppt-image-wrap" element.
 */
function savePPTData() {
    alert("Function savePPTData was called.");
    const picUrls = [];
    for (const wrap of document.getElementsByClassName("ppt-image-wrap")) {
        picUrls.push(wrap.children[0].src);
    }
    createAndDownloadFile("urls.csv", picUrls.join("\n"));
}

/**
 * Export a spreadsheet-type document: the URL of the table image followed by
 * the text found in the table, written to a single text file.
 */
function saveExcelData() {
    // 1. The table itself is rendered as the background image of one element.
    const tablePic = document.getElementsByClassName("reader-pic-item")[0];
    const cssUrl = tablePic.style.getPropertyValue("background-image");
    // Assumes the value has the form url("..."): drop the 5 leading and the
    // 2 trailing characters to keep only the address.
    const pureUrl = cssUrl.slice(5, -2);

    // 2. Text inside the table, one layer per line, with odd spaces normalized.
    const text = normalizeSpaces(collectWordLayerTexts().join("\n"));

    // 3. Merge both into one string and export it.
    const head = "表格图形链接如下(复制到浏览器中打开):";
    const content = head + "\n\n" + pureUrl + "\n\n" + text;
    createAndDownloadFile("图片地址和表格内容.txt", content);
}

/**
 * Export a mixed text-and-picture document. Only the text can be saved.
 */
function saveDocAndPicData() {
    alert("Function saveDocAndPicData was called.");
    const content = formatText2(collectWordLayerTexts().join(""));
    createAndDownloadFile("纯文本文档.txt", content);
}

/**
 * Export a picture-based document that also carries one text title line.
 * The title line is ignored and the pictures are exported as for a pdf.
 */
function savePdfWithTitleData() {
    savePDFData();
}

/**
 * Guess the document type from how many picture, text and slide elements the
 * page contains.
 *
 * @returns {string|Object} One of "pdf", "doc", "ppt", "excel", "docANDpic",
 *     "pdfWithTitle"; or, when no rule matches, an object holding the three
 *     element counts for diagnostics.
 */
function detectType() {
    const pdf = document.getElementsByClassName("reader-pic-item").length;
    const doc = document.getElementsByClassName("reader-word-layer").length;
    const ppt = document.getElementsByClassName("ppt-image-wrap").length;

    if (pdf && !doc && !ppt) {
        return "pdf";
    }
    if (doc && !pdf && !ppt) {
        return "doc";
    }
    if (ppt && !pdf && !doc) {
        return "ppt";
    }
    if (pdf === 1 && doc > 1 && !ppt) {
        return "excel";
    }
    if (pdf > 2 && doc > 2 && !ppt) {
        return "docANDpic";
    }
    if (pdf > 2 && doc === 1 && !ppt) {
        return "pdfWithTitle";
    }
    return {"pdf元素数量": pdf, "doc元素数量": doc, "ppt元素数量": ppt};
}

/**
 * Click handler of the second (green) button: detect the document type and
 * run the matching exporter. An unrecognized type is reported with the element
 * counts returned by detectType().
 */
function saveData() {
    const category = detectType();
    switch (category) {
        case "pdf":
            savePDFData();
            break;
        case "doc":
            saveDocData();
            break;
        case "ppt":
            savePPTData();
            break;
        case "excel":
            saveExcelData();
            break;
        case "docANDpic":
            saveDocAndPicData();
            break;
        case "pdfWithTitle":
            savePdfWithTitleData();
            break;
        default: {
            const info = Object.entries(category).map(
                ([key, value]) => `${key} : ${value}`
            );
            alert("未知文档类型\n" + info.join("\n"));
        }
    }
}

/*
 * Entry points
 */

/**
 * Create one of the script's bar-shaped buttons: 25px high, half the page
 * wide and horizontally centered.
 *
 * @param {string} backgroundColor - CSS color of the bar.
 * @param {string} [className] - Optional class used to find the button later.
 * @returns {HTMLButtonElement}
 */
function createBarButton(backgroundColor, className) {
    const btn = document.createElement("button");
    if (className) {
        btn.setAttribute("class", className);
    }
    btn.style.height = "25px";
    btn.style.width = "50%";
    btn.style.marginLeft = "25%";
    btn.style.backgroundColor = backgroundColor;
    return btn;
}

/**
 * Set up the script on Baidu Wenku: a blue "expand" button and a green
 * "export" button that stays hidden until the document is fully expanded.
 */
function baiduWenku() {
    const expandBtn = createBarButton("blue", "init-btn");
    const exportBtn = createBarButton("green", "save-doc-btn");
    exportBtn.style.display = "none";

    expandBtn.addEventListener("click", readAll);
    exportBtn.addEventListener("click", saveData);

    document.body.appendChild(expandBtn);
    document.body.appendChild(exportBtn);

    console.log("Program Loaded");
}

/**
 * Set up the script on Docin: a single green button that opens the browser's
 * print dialog.
 */
function docin() {
    const btn = createBarButton("green");
    btn.addEventListener("click", () => {
        window.print();
    });
    document.body.appendChild(btn);

    console.log("Program Loaded");
}

/**
 * Pick the site-specific setup routine from the current host name.
 */
function main() {
    const host = window.location.host;
    if (host === "wenku.baidu.com") {
        baiduWenku();
    } else if (host === "www.docin.com") {
        docin();
    } else {
        console.log("匹配到了无效网页");
    }
}

// Bootstrap only inside a browser page, so the helpers above can also be
// loaded where no `window` exists.
if (typeof window !== "undefined") {
    main();
}