// ==UserScript==
// @name         Wenku Doc Downloader
// @namespace    http://tampermonkey.net/
// @version      0.9
// @description  下载“百度文库”文档,导出txt或pdf。“豆丁网”“爱问共享资料”(新浪文档)文档导出pdf。在文档页面最底部有蓝/绿色长方形按钮,说明脚本生效了,否则就没有生效。
// @author       allenlv2690@gmail.com
// @match        https://wenku.baidu.com/view/*
// @match        https://www.docin.com/p-*
// @match        https://ishare.iask.sina.com.cn/f/*
// @icon         https://www.google.com/s2/favicons?domain=limestart.cn
// @grant        none
// @license      GPL-3.0-only
// @create       2021-11-22
// @note         修复了纯图片文档报错的bug(程序里有个字符串写错了【笑哭】)
// @note         以后打算更新对纯图片文档网页上直接导出PDF功能的支持(尽管还是更推荐使用【图片下载合并器】)
// @downloadURL none
// ==/UserScript==

/*
 * Auxiliary helper functions
 */

/**
 * Build a Blob from `content` and trigger a browser download for it.
 *
 * @param {string} fileName - Name suggested to the browser for the saved file.
 * @param {*} content - Payload wrapped as the single part of a Blob.
 */
function createAndDownloadFile(fileName, content) {
    const link = document.createElement("a");
    const blob = new Blob([content]);
    const objectUrl = URL.createObjectURL(blob);
    link.download = fileName;
    link.href = objectUrl;
    link.click();
    // revokeObjectURL expects the URL string that createObjectURL returned,
    // not the Blob itself; passing the Blob never releases the URL.
    URL.revokeObjectURL(objectUrl);
}

/**
 * Tidy text extracted from text-only documents.
 *
 * @param {string} text - Raw concatenated text.
 * @returns {string} The processed text.
 */
function formatText(text) {
    // NOTE(review): neither pattern carries the `g` flag, so each step only
    // touches the first occurrence. The first pattern also looks like a
    // lookahead whose parentheses were escaped by mistake. The intent cannot
    // be confirmed from this file, so behaviour is kept as is. An unused
    // third pattern (a space between two CJK characters) was dropped.
    const markerTarget = / [(]?=[\u4e00-\u9fa5] [)]/;
    const spaceAfterMarker = /(?<=TEMP[\u4e00-\u9fa5]) /;

    const marked = text.replace(markerTarget, "TEMP");
    const trimmed = marked.replace(spaceAfterMarker, "");
    const unmarked = trimmed.replace("TEMP", "");
    // NOTE(review): as written this swaps a space for a space; the pattern
    // may originally have targeted a different whitespace character.
    return unmarked.replace(/ /g, " ");
}

/**
 * Tidy text extracted from documents that mix graphics and text.
 *
 * @param {string} text - Raw concatenated text.
 * @returns {string} The processed text with the "精选文档" watermark removed
 *     and consecutive newlines collapsed.
 */
function formatText2(text) {
    // NOTE(review): the first, third and fourth patterns lack the `g` flag,
    // so only their first match is rewritten; kept as in the original.
    const step1 = text.replace(/[  ]{2,}/, "\n");
    const step2 = step1.replace(/[  ]\n/g, "\n");
    const step3 = step2.replace(/\n[   ]*\n*\n/, "\n");
    const step4 = step3.replace(/ *\n * /, "\n");
    const step5 = step4.replace(/[  ]/g, " ");
    // The original call omitted the replacement argument, which made
    // String.prototype.replace insert the literal text "undefined".
    // A newline keeps the neighbouring text separated; the final pass
    // collapses any run of newlines this creates.
    return step5
        .replace(/[ \n]精选文档[ \n]/g, "\n")
        .replace(/\n{2,}/g, "\n");
}

/**
 * Classify the current Baidu Wenku document.
 *
 * Reads the class name of the first child of the first ".doc-title-wrap"
 * element and counts ".reader-pic-item", ".reader-word-layer" and
 * ".ppt-image-wrap" elements.
 *
 * @returns {string|Object} A category string such as "doc-only-word",
 *     "pdf-only-pic", "ppt" or "txt"; or, when no rule matches, an object
 *     holding the detected type and the three element counts.
 * @throws {TypeError} If the ".doc-title-wrap" element or its first child
 *     is missing.
 */
function detectType() {
    const title_wrap = document.getElementsByClassName("doc-title-wrap")[0];
    const file_type = title_wrap.children[0].className;

    // The first keyword contained in the class name wins; otherwise the raw
    // class name is kept so it shows up in the diagnostic object below.
    // (The original wrote `search("txt" !== -1)`, which made nearly every
    // unknown class name count as "txt".)
    const known_types = ["word", "ppt", "excel", "pdf", "txt"];
    const type = known_types.find((name) => file_type.includes(name)) || file_type;

    // A count of 0 means no such element exists on the page.
    const pic_nums = document.getElementsByClassName("reader-pic-item").length;
    const word_nums = document.getElementsByClassName("reader-word-layer").length;
    const ppt_img_nums = document.getElementsByClassName("ppt-image-wrap").length;

    if (type === "word" && !word_nums && pic_nums) {
        // doc: pictures only
        return "doc-only-pic";
    } else if (type === "word" && word_nums > 2 && pic_nums <= 1) {
        // doc: text only
        return "doc-only-word";
    } else if (type === "word" && pic_nums > 2 && word_nums > 2) {
        // doc: mixed graphics and text
        return "doc-pic-word";
    } else if (type === "pdf" && pic_nums > 2 && word_nums === 1) {
        // pdf: a single text title followed by graphics
        return "pdf-pic-title";
    } else if (type === "pdf" && !word_nums && pic_nums) {
        // pdf: graphics only
        return "pdf-only-pic";
    } else if (type === "pdf" && !pic_nums && word_nums > 1) {
        // pdf: text only
        return "pdf-only-word";
    } else if (type === "pdf" && word_nums > 2 && pic_nums > 1) {
        // pdf: mixed graphics and text
        return "pdf-pic-word";
    } else if ((type === "ppt" && ppt_img_nums > 2)
        || (type === "pdf" && !word_nums && !pic_nums && ppt_img_nums)) {
        // ppt with at least 3 pages, or a pdf made only of ppt image pages
        return "ppt";
    } else if (type === "excel" && pic_nums && word_nums > 2) {
        // excel: contains selectable text
        return "excel-only-word";
    } else if (type === "excel" && pic_nums && !word_nums) {
        // excel: graphics only
        return "excel-only-pic";
    } else if (type === "txt") {
        // txt: plain text
        return "txt";
    }
    return {
        "源文档类型": type,
        "图形数量": pic_nums,
        "文字块数量": word_nums,
        "ppt纯图形页面数量": ppt_img_nums
    };
}

/**
 * Remove an element from the page on a best-effort basis.
 *
 * @param {Element|null|undefined} element - Element to remove; a missing
 *     element is ignored.
 */
function tryToRemoveElement(element) {
    if (!element) {
        return;
    }
    try {
        element.remove();
    } catch (e) {
        // Best effort: report the failure and keep going.
        console.log(e);
    }
}
/**
 * Remove every element of every collection in `elem_list_box`.
 *
 * @param {Iterable} elem_list_box - Collections of elements, e.g.
 *     [list1, list2, ...]; falsy entries are skipped.
 */
function tryToRemoveSameElem(elem_list_box) {
    for (const elem_list of elem_list_box) {
        if (!elem_list) {
            continue;
        }
        // getElementsByClassName returns a live collection: removing an
        // element while iterating it directly skips the following one.
        // Iterate over a snapshot instead.
        for (const elem of Array.from(elem_list)) {
            try {
                elem.remove();
            } catch (e) {
                console.log(e);
            }
        }
    }
}

/**
 * Ask the user for a left-margin percentage and apply it to the first
 * element with the given class.
 *
 * @param {string} class_name - Class of the element to shift.
 * @param {string} default_offset - Value pre-filled in the prompt.
 * @returns {boolean} true when a valid offset (an integer 0-59) was applied;
 *     false when the dialog was cancelled or the input was invalid.
 */
function centerDoc(class_name, default_offset) {
    const doc_main = document.getElementsByClassName(class_name)[0];
    const offset = window.prompt("请输入偏移百分位:", default_offset);

    // prompt() yields null when the dialog is cancelled.
    if (offset === null) {
        return false;
    }
    // One digit, or two digits whose first digit is 1-5.
    if (/^(?:[0-9]|[1-5][0-9])$/.test(offset)) {
        doc_main.style.marginLeft = offset + "%";
        return true;
    }
    alert("请输入一个正整数,范围在0至59之间,用来使文档居中\n(不同文档偏移量不同,所以需要手动调整)");
    return false;
}

/*
 * Main feature functions
 */

// Not referenced by any function visible in this file.
var docin_counter = 0;

/**
 * Clean up a docin.com document page and open the print dialog.
 */
function printPageDocin() {
    // Select the pointer cursor tool, if the control exists.
    try {
        document.getElementById("j_select").click();
    } catch (e) {
        console.log(e);
    }

    // Single elements unrelated to the document.
    const unrelated = [
        document.getElementsByClassName("doc_header_mod")[0],
        document.getElementsByClassName("head_wrapper")[0],
        document.getElementsByClassName("aside")[0],
        document.getElementById("docinShareSlider"),
        document.getElementsByClassName("no_more_mod")[0],
        document.getElementById("likeToo"),
        document.getElementsByClassName("tools_bottom_bar")[0],
        document.getElementsByClassName("page_crubms")[0],
        document.getElementById("jControlDivRecomm"),
        document.getElementsByClassName("backToTop")[0]
    ];
    for (const elem of unrelated) {
        tryToRemoveElement(elem);
    }

    // All elements sharing the ad class.
    tryToRemoveSameElem([document.getElementsByClassName("adBox")]);

    // Shift the document with a fixed left margin.
    const doc = document.getElementsByClassName("main")[0];
    doc.style.marginLeft = "6%";

    // Hide the script's button while printing, then show it again.
    const btn_2 = document.getElementsByClassName("save-doc-btn")[0];
    btn_2.style.display = "none";
    alert("如果预览时有空白页,请取消打印\n请上下滚动页面,确保每页内容都加载完成\n如果文档中有广告,请取消打印,再点一次按钮\n最多不超过2次,应该没有广告了");
    window.print();
    btn_2.style.removeProperty("display");
}

/**
 * Clean up an ishare.iask.sina.com.cn document page and open the print
 * dialog. Stops early when the user does not supply a valid offset.
 */
function printPageiShare() {
    // The parent of the breadcrumb arrow is removed as a whole; the arrow
    // may be absent.
    let crumb_arrow;
    try {
        crumb_arrow = document.getElementsByClassName("crumb-arrow")[0].parentElement;
    } catch (e) {
        console.log(e);
    }

    // Single elements unrelated to the document.
    const unrelated = [
        document.getElementsByClassName("detail-topbanner")[0],
        document.getElementsByClassName("new-detail-header")[0],
        document.getElementById("fix-right"),
        document.getElementsByClassName("loginRedPacket-dialog")[0],
        document.getElementsByClassName("fixed-right-full")[0],
        document.getElementsByClassName("website-footer")[0],
        document.getElementsByClassName("guess-you-like-warpper")[0],
        document.getElementsByClassName("detail-top-box")[0],
        document.getElementsByClassName("reader-fullScreen")[0],
        document.getElementsByClassName("endof-trial-reading")[0],
        crumb_arrow,
        document.getElementsByClassName("copyright-container")[0],
        document.getElementsByClassName("state-bottom")[0]
    ];
    for (const elem of unrelated) {
        tryToRemoveElement(elem);
    }

    // All elements sharing the ad class.
    tryToRemoveSameElem([document.getElementsByClassName("adv-container")]);

    // Center the document.
    alert("建议使用:\n偏移量:18\n缩放:默认\n如果预览中有广告,就取消打印\n再点一次按钮,预览中应该就没有广告了");
    if (!centerDoc("doc-main", "18")) {
        return; // invalid or cancelled input: abort
    }

    // Hide the script's button while printing, then show it again.
    const btn_2 = document.getElementsByClassName("save-doc-btn")[0];
    btn_2.style.display = "none";
    window.print();
    btn_2.style.removeProperty("display");
}

/**
 * Clean up a Baidu Wenku document page and open the print dialog.
 * Stops early when the user does not supply a valid offset.
 */
function printPageBaidu() {
    // Single elements unrelated to the document.
    const unrelated = [
        document.getElementsByClassName("header-wrapper")[0],
        document.getElementById("right-wrapper-id"),
        document.getElementsByClassName("reader-topbar")[0],
        document.getElementsByClassName("try-end-fold-page")[0]
    ];
    for (const elem of unrelated) {
        tryToRemoveElement(elem);
    }

    // All elements of these classes.
    const lazy_load_list = document.getElementsByClassName("lazy-load");
    const no_full_screen_list = document.getElementsByClassName("no-full-screen");
    const ads = document.getElementsByClassName("hx-warp");
    tryToRemoveSameElem([lazy_load_list, ads, no_full_screen_list]);

    // Center the document.
    alert("建议使用:\n偏移量:0\n缩放:118%");
    if (!centerDoc("left-wrapper", "0")) {
        return; // invalid or cancelled input: abort
    }

    // Hide the script's button section while printing, then show it again.
    const section = document.getElementsByClassName("btns_section")[0];
    section.style.display = "none";
    window.print();
    section.style.removeProperty("display");
}

/**
 * Create the orange "export pdf" button and append it to the script's
 * button section.
 */
function createSaveHtmlBtn() {
    const btn_3 = document.createElement("button");
    btn_3.setAttribute("class", "save-html-btn");
    btn_3.textContent = "导出pdf(实验性)";
    Object.assign(btn_3.style, {
        height: "25px",
        width: "15%",
        marginLeft: "0.2%",
        backgroundColor: "orange",
        border: "none",
        fontWeight: "bold",
        borderRadius: "10%"
    });

    btn_3.onclick = printPageBaidu;
    const section = document.getElementsByClassName("btns_section")[0];
    section.appendChild(btn_3);
}

/**
 * ishare: click the "continue reading" control while it is offered;
 * otherwise reveal the print button and hide the expand button.
 */
function readAlliShare() {
    const red_btn = document.getElementsByClassName("red-color")[0];

    // Still expandable: expand one more step. A missing control is treated
    // as "nothing left to expand" instead of raising a TypeError.
    if (red_btn && red_btn.textContent.search("点击可继续阅读") !== -1) {
        red_btn.click();
        return;
    }

    const hint = "文档已经完全展开,可以导出";
    alert(hint);
    const init_btn = document.getElementsByClassName("init-btn")[0];
    const save_doc_btn = document.getElementsByClassName("save-doc-btn")[0];
    save_doc_btn.style.removeProperty("display");
    init_btn.style.display = "none";
}

/**
 * Baidu Wenku: click the "read all" control while it exists; once it is
 * gone, classify the document and reveal the matching export button(s).
 */
function readAll() {
    const read_all_btn = document.getElementsByClassName("read-all")[0];
    if (read_all_btn) {
        read_all_btn.click();
        return;
    }

    const hint = "文档已经完全展开,可以导出";
    alert(hint);

    let category;
    try {
        category = detectType();
    } catch (e) {
        alert("未知/特殊文档类型,例如学术文献,暂不支持下载\n也可与作者反馈或联系:\nallenlv2690@gmail.com");
        return undefined;
    }

    const init_btn = document.getElementsByClassName("init-btn")[0];
    const save_doc_btn = document.getElementsByClassName("save-doc-btn")[0];

    const with_text = ["doc-only-word", "doc-pic-word", "pdf-only-word",
        "pdf-pic-word", "excel-only-word"];
    const pictures_only = ["doc-only-pic", "pdf-pic-title", "ppt",
        "pdf-only-pic", "excel-only-pic"];

    if (with_text.includes(category)) {
        // Documents with text also get the orange "export pdf" button;
        // the green button is narrowed to make room for it.
        save_doc_btn.style.width = "34.8%";
        createSaveHtmlBtn();
    } else if (pictures_only.includes(category)) {
        // Picture-only documents export image links instead of text.
        save_doc_btn.textContent = "导出全部图片链接";
    }

    save_doc_btn.style.removeProperty("display");
    init_btn.style.display = "none";
}

/**
 * Export the background-image URLs of all ".reader-pic-item" elements,
 * one per line, as "urls.csv".
 */
function savePDFData() {
    const pic_items = document.getElementsByClassName("reader-pic-item");
    const prefix = /: ?url[(]/;                    // text before the URL
    const suffix = /[)]; ?background-position/;    // text after the URL

    const text_list = [];
    for (const item of Array.from(pic_items)) {
        const whole_text = item.getAttribute("style") || "";
        const de_pretext = whole_text.split(prefix)[1];
        if (de_pretext === undefined) {
            continue; // no url(...) in this style: nothing to export
        }
        text_list.push(de_pretext.split(suffix)[0]);
    }

    // NOTE(review): quotes are stripped from the first entry only, exactly
    // as before. The file is consumed by an external tool, so the format is
    // left untouched; confirm whether all entries should be unquoted.
    if (text_list.length > 0) {
        text_list[0] = text_list[0].replace(/"/g, "");
    }
    createAndDownloadFile("urls.csv", text_list.join("\n"));
}

/**
 * Export the text of all ".reader-word-layer" elements, passed through
 * formatText, as a .txt file.
 */
function saveDocData() {
    const text_elements = document.getElementsByClassName("reader-word-layer");
    const texts = Array.from(text_elements, (elem) => elem.textContent);
    const content = formatText(texts.join(""));
    createAndDownloadFile("纯文本文档.txt", content);
}

/**
 * Export the `src` of the first child of every ".ppt-image-wrap" element,
 * one per line, as "urls.csv".
 */
function savePPTData() {
    const pic_elements = document.getElementsByClassName("ppt-image-wrap");
    const pic_urls = Array.from(pic_elements, (elem) => elem.children[0].src);
    createAndDownloadFile("urls.csv", pic_urls.join("\n"));
}

/**
 * Export the first picture's background-image address together with the
 * text of all ".reader-word-layer" elements as a .txt file.
 */
function saveExcelData() {
    // 1. Image address of the table picture.
    const table_pic = document.getElementsByClassName("reader-pic-item")[0];
    const url = table_pic.style.getPropertyValue("background-image");
    // Drops the first 5 and last 2 characters (presumably the `url("`
    // prefix and `")` suffix of the CSS value).
    const pure_url = url.slice(5, -2);

    // 2. Text inside the table.
    const text_elems = document.getElementsByClassName("reader-word-layer");
    const joined = Array.from(text_elems, (elem) => elem.textContent).join("\n");
    // NOTE(review): as written this swaps a space for a space; the pattern
    // may originally have targeted a different whitespace character.
    const text = joined.replace(/ /g, " ");

    // 3. Merge into one string and export.
    const head = "表格图形链接如下(复制到浏览器中打开):";
    const content = head + "\n\n" + pure_url + "\n\n" + text;
    createAndDownloadFile("图片地址和表格内容.txt", content);
}

/**
 * Export the text of all ".reader-word-layer" elements, passed through
 * formatText2, as a .txt file (only the text of a mixed document is saved).
 */
function saveDocAndPicData() {
    const text_elements = document.getElementsByClassName("reader-word-layer");
    const texts = Array.from(text_elements, (elem) => elem.textContent);
    const content = formatText2(texts.join(""));
    createAndDownloadFile("纯文本文档.txt", content);
}

/**
 * Export the text of all ".p-txt" elements as a .txt file.
 */
function saveTxtData() {
    const text_elements = document.getElementsByClassName("p-txt");
    const texts = Array.from(text_elements, (elem) => elem.textContent);
    createAndDownloadFile("纯文本文档.txt", texts.join(""));
}

/**
 * Classify the document with detectType and run the matching exporter;
 * for an unrecognised category, show the collected diagnostics.
 */
function saveData() {
    const category = detectType();

    const picture_categories = ["doc-only-pic", "pdf-pic-title",
        "pdf-only-pic", "excel-only-pic"];
    const text_categories = ["doc-only-word", "doc-pic-word",
        "pdf-only-word", "pdf-pic-word"];

    if (picture_categories.includes(category)) {
        // Picture documents: export the image links.
        savePDFData();
    } else if (text_categories.includes(category)) {
        // Text-heavy, non-table documents: export the plain text.
        saveDocData();
    } else if (category === "ppt") {
        savePPTData();
    } else if (category === "excel-only-word") {
        saveExcelData();
    } else if (category === "txt") {
        saveTxtData();
    } else {
        const info = Object.entries(category).map(([key, value]) => key + " : " + value);
        alert("未知处理类型,请反馈或联系作者:\nallenlv2690@gmail.com\n" + info.join("\n"));
    }
}

/**
 * Create the two initial buttons (expand document, save document), wrap
 * them in a "btns_section" <section> and append it to the page body.
 *
 * @returns {HTMLButtonElement[]} [btn_1, btn_2]; btn_2 starts hidden.
 */
function create2btns() {
    const btn_1 = document.createElement("button");
    const btn_2 = document.createElement("button");

    btn_1.setAttribute("class", "init-btn");
    btn_1.textContent = "展开文档";
    Object.assign(btn_1.style, {
        height: "25px",
        width: "50%",
        marginLeft: "25%",
        border: "none",
        backgroundColor: "blue",
        color: "white",
        fontWeight: "bold"
    });

    btn_2.setAttribute("class", "save-doc-btn");
    Object.assign(btn_2.style, {
        height: "25px",
        width: "50%",
        marginLeft: "25%",
        backgroundColor: "green",
        border: "none",
        display: "none",
        color: "white",
        fontWeight: "bold"
    });

    const section = document.createElement("section");
    section.setAttribute("class", "btns_section");
    section.appendChild(btn_1);
    section.appendChild(btn_2);
    document.body.appendChild(section);

    return [btn_1, btn_2];
}

/*
 * Entry points
 */

/**
 * Set up the buttons for wenku.baidu.com and add a print stylesheet.
 */
function baiduWenku() {
    const [btn_1, btn_2] = create2btns();
    btn_2.textContent = "导出纯文本";
    btn_1.onclick = readAll;
    btn_2.onclick = saveData;

    // Print stylesheet forcing the body to be displayed as a block.
    const style = document.createElement("style");
    style.type = "text/css";
    style.textContent = `@media print { body{ display:block; } }`;
    document.getElementsByTagName("head")[0].appendChild(style);
}

/**
 * Set up the single print button for www.docin.com.
 */
function docin() {
    const [btn_1, btn_2] = create2btns();
    btn_1.remove();
    btn_2.textContent = "打印页面到PDF";
    btn_2.style.removeProperty("display");
    btn_2.onclick = printPageDocin;
}

/**
 * Set up the buttons for ishare.iask.sina.com.cn and remove the
 * ".detail-fixed" bar.
 */
function ishare() {
    const [btn_1, btn_2] = create2btns();
    btn_2.textContent = "打印页面到PDF";
    btn_1.onclick = readAlliShare;
    btn_2.onclick = printPageiShare;

    // The bar may be absent; removal is best effort.
    tryToRemoveElement(document.getElementsByClassName("detail-fixed")[0]);
}

/**
 * Dispatch to the site-specific setup based on the current host.
 */
function main() {
    const host = window.location.host;
    if (host === "wenku.baidu.com") {
        baiduWenku();
    } else if (host === "www.docin.com") {
        docin();
    } else if (host === "ishare.iask.sina.com.cn") {
        ishare();
    } else {
        console.log("匹配到了无效网页");
    }
}

// Auto-run only where a browser window exists.
if (typeof window !== "undefined") {
    main();
}