"""Batch journal-ranking lookup for a folder of downloaded papers.

Looking up the impact factor of every downloaded paper by hand is slow, and
doing it through Zotero plus plug-ins is cumbersome.  This module wraps the
whole workflow in one class: it walks a folder of PDF files, extracts each
paper's DOI, resolves the journal through Crossref, queries easyScholar for
the impact factor and quartile, and writes an Excel sheet, two figures and a
Word report.

An easyScholar secret key must be obtained before using this module.

Usage
-----
Save this file as ``literature_manipulation.py`` next to the calling script,
then either run it directly after filling in the configuration constants
near the bottom of the file, or drive it from another script::

    import literature_manipulation

    tool = literature_manipulation.JournalLiteratureStatistics(
        papers_dir=PAPERS_DIR, secret_key=SECRET_KEY, email=EMAIL
    )
    df_result = tool.batch_analyze()

Set the PDF folder, the easyScholar key and a contact e-mail address (used
for Crossref's polite pool); running the script then produces the report
and the Excel table.
"""

import re
import time  # noqa: F401  (unused here; kept because the original module imported it)
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import quote

import pandas as pd

# fitz (PyMuPDF), requests, matplotlib, python-docx and tqdm are imported
# inside the methods that need them, so the pandas-only helpers (cleaning,
# summarising, journal table) stay usable when those packages are missing.

# DOI as printed on a first page: "10.", a 4-9 digit registrant, "/", suffix.
_DOI_RE = re.compile(r"\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b", re.I)
# ASCII control characters removed before writing to Excel:
# 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F.
_EXCEL_ILLEGAL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F]")

_CROSSREF_WORKS_URL = "https://api.crossref.org/works/"
_EASYSCHOLAR_RANK_URL = "https://www.easyscholar.cc/open/getPublicationRank"


class JournalLiteratureStatistics:
    """End-to-end bibliometric helper for a folder of PDF papers.

    Pipeline: PDF parsing -> DOI detection -> Crossref / easyScholar lookup
    -> summary statistics and figures -> Word / Excel report.
    """

    def __init__(self, papers_dir, secret_key,
                 email="your_email@example.com", max_workers=5):
        """Store the configuration and verify that the paper folder exists.

        Args:
            papers_dir: Folder containing the PDF files.
            secret_key: easyScholar API secret key.
            email: Contact address sent in the User-Agent header of
                Crossref requests.
            max_workers: Number of threads used by ``batch_analyze``.

        Raises:
            FileNotFoundError: If ``papers_dir`` does not exist.
        """
        self.papers_dir = Path(papers_dir)
        self.secret_key = secret_key
        # Crossref "polite pool" header.
        self.headers = {
            "User-Agent": f"LiteratureStats/1.0 (mailto:{email})"
        }
        self.max_workers = max_workers
        if not self.papers_dir.exists():
            raise FileNotFoundError(f"目录不存在: {self.papers_dir}")

    # ------------------------------------------------------------------
    # 0. Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def clean_text(val):
        """Strip ASCII control characters from strings before Excel export.

        Non-string values are returned unchanged.
        """
        if isinstance(val, str):
            return _EXCEL_ILLEGAL_RE.sub("", val)
        return val

    @staticmethod
    def _find_doi(text):
        """Return the first DOI found in *text*, or None."""
        match = _DOI_RE.search(text or "")
        return match.group(1) if match else None

    @staticmethod
    def _guess_title(text):
        """Guess a title: the longest of the first 15 non-empty lines whose
        length is strictly between 10 and 200 characters, or None."""
        lines = [ln.strip() for ln in (text or "").split("\n") if ln.strip()]
        candidates = [ln for ln in lines[:15] if 10 < len(ln) < 200]
        return max(candidates, key=len) if candidates else None

    @staticmethod
    def _parse_crossref_message(msg):
        """Pick journal, ISSN and title out of a Crossref ``message`` dict.

        List-valued fields that are missing or empty yield None instead of
        raising IndexError.
        """
        msg = msg or {}

        def first(key):
            values = msg.get(key) or [None]
            return values[0]

        issn = msg.get("ISSN")
        return {
            "Journal": first("container-title"),
            "ISSN": ",".join(issn) if issn else None,
            "Crossref_Title": first("title"),
        }

    @staticmethod
    def _parse_rank_response(res):
        """Translate an easyScholar JSON response into flat record fields."""
        res = res or {}
        if res.get("code") != 200:
            return {"Error_Rank": res.get("msg", "Unknown EasyScholar Error")}
        # Any level of the nested payload may be null.
        data = res.get("data") or {}
        official = (data.get("officialRank") or {}).get("all") or {}
        if not official:
            return {"Warning_Rank": "No ranking data found"}
        return {
            "SCI_IF": official.get("sciif"),
            "SCI_IF_5yr": official.get("sciif5"),
            "SCI_JCR": official.get("sci"),
            "CAS_Upgrade": official.get("sciUp"),
            "CAS_Warning": official.get("sciwarn"),
        }

    # ------------------------------------------------------------------
    # 1. PDF -> title / DOI
    # ------------------------------------------------------------------
    def extract_pdf_metadata(self, pdf_path: Path) -> dict:
        """Read title and DOI from the metadata and first page of a PDF.

        Returns:
            A dict with ``File``, ``Title`` and ``DOI`` keys; ``Error_PDF``
            is added when the file cannot be opened or read.
        """
        import fitz  # PyMuPDF

        pdf_path = Path(pdf_path)
        info = {"File": pdf_path.name, "Title": None, "DOI": None}
        try:
            with fitz.open(str(pdf_path)) as doc:
                # Prefer the embedded metadata title.
                meta = doc.metadata or {}
                info["Title"] = meta.get("title")
                # First-page text feeds the DOI regex and title fallback.
                text = doc[0].get_text("text") if len(doc) > 0 else ""
        except Exception as e:  # best effort: record the error on the row
            info["Error_PDF"] = str(e)
            return info

        info["DOI"] = self._find_doi(text)

        # Fall back to a first-page guess when the metadata title is
        # missing, very short, or contains "Untitled".
        title = info["Title"]
        if not title or len(title) < 5 or "Untitled" in title:
            guessed = self._guess_title(text)
            if guessed:
                info["Title"] = guessed
        return info

    # ------------------------------------------------------------------
    # 2. DOI -> Crossref
    # ------------------------------------------------------------------
    def query_crossref(self, doi: str) -> dict:
        """Look up journal name, ISSN and title for *doi* on Crossref.

        Returns ``{}`` for an empty DOI and ``{"Error_Crossref": ...}`` when
        the request or JSON decoding fails.
        """
        if not doi:
            return {}
        import requests

        # Percent-encode the DOI (keeping "/") so characters such as "#",
        # "?" or "<" cannot alter the request path.
        url = _CROSSREF_WORKS_URL + quote(doi, safe="/")
        try:
            r = requests.get(url, headers=self.headers, timeout=15)
            r.raise_for_status()
            msg = r.json().get("message", {})
        except Exception as e:
            return {"Error_Crossref": str(e)}
        return self._parse_crossref_message(msg)

    # ------------------------------------------------------------------
    # 3. easyScholar -> impact factor / quartile
    # ------------------------------------------------------------------
    def query_journal_rank(self, journal_name: str) -> dict:
        """Query easyScholar for the ranking data of *journal_name*.

        Returns ``{}`` for an empty name, ``{"Error_Rank": ...}`` on request
        or API failure, ``{"Warning_Rank": ...}`` when no ranking exists.
        """
        if not journal_name:
            return {}
        import requests

        # Pass the raw name: requests percent-encodes ``params`` itself, so
        # pre-quoting would double-encode multi-word journal names.
        params = {
            "secretKey": self.secret_key,
            "publicationName": journal_name,
        }
        try:
            r = requests.get(_EASYSCHOLAR_RANK_URL, params=params, timeout=15)
            r.raise_for_status()
            res = r.json()
        except Exception as e:
            return {"Error_Rank": str(e)}
        return self._parse_rank_response(res)

    # ------------------------------------------------------------------
    # Single-paper pipeline
    # ------------------------------------------------------------------
    def process_single_paper(self, pdf_path):
        """Run extraction, Crossref and easyScholar lookups for one PDF."""
        # Step 1: local extraction.
        info = self.extract_pdf_metadata(pdf_path)

        # Step 2: online lookups need a DOI.
        if not info.get("DOI"):
            info["Error"] = "DOI extraction failed"
            return info

        cr = self.query_crossref(info["DOI"])
        info.update(cr)
        # The Crossref title is usually cleaner than the PDF-derived one.
        if cr.get("Crossref_Title"):
            info["Title"] = cr["Crossref_Title"]

        if cr.get("Journal"):
            info.update(self.query_journal_rank(cr.get("Journal")))
        else:
            info["Error"] = "Journal name missing from Crossref"
        return info

    # ------------------------------------------------------------------
    # 4. Batch analysis (multi-threaded)
    # ------------------------------------------------------------------
    def batch_analyze(self) -> pd.DataFrame:
        """Process every PDF in ``papers_dir`` concurrently.

        Returns:
            One row per PDF; a paper whose pipeline raised gets a row with
            ``File`` and an ``Error`` message starting with ``Crash:``.
        """
        from tqdm import tqdm  # pip install tqdm

        # Case-insensitive suffix match so "*.PDF" files are not skipped;
        # sorted so tasks are submitted in a stable order.
        pdf_files = sorted(
            p for p in self.papers_dir.iterdir()
            if p.is_file() and p.suffix.lower() == ".pdf"
        )
        records = []

        print(f"开始分析 {len(pdf_files)} 篇文献 (并发数: {self.max_workers})...")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_pdf = {
                executor.submit(self.process_single_paper, pdf): pdf
                for pdf in pdf_files
            }
            for future in tqdm(as_completed(future_to_pdf),
                               total=len(pdf_files), unit="paper"):
                try:
                    records.append(future.result())
                except Exception as e:
                    pdf = future_to_pdf[future]
                    records.append(
                        {"File": pdf.name, "Error": f"Crash: {str(e)}"}
                    )

        return pd.DataFrame(records)

    # ------------------------------------------------------------------
    # 5. Summary statistics
    # ------------------------------------------------------------------
    def summarize(self, df: pd.DataFrame) -> dict:
        """Return headline counts for *df*; ``{}`` for an empty frame.

        Columns absent from *df* (for example when no paper yielded a DOI)
        are treated as entirely missing instead of raising KeyError.
        """
        if df.empty:
            return {}

        def column(name):
            if name in df.columns:
                return df[name]
            return pd.Series([None] * len(df), index=df.index, dtype="object")

        return {
            "Total papers": len(df),
            "DOI detected": int(column("DOI").notna().sum()),
            "Journals identified": int(column("Journal").notna().sum()),
            "SCI Q1": int((column("SCI_JCR") == "Q1").sum()),
            "CAS 1st Quartile": int((column("CAS_Upgrade") == "1区").sum()),
            "CAS Warning": int(column("CAS_Warning").notna().sum()),
        }

    # ------------------------------------------------------------------
    # 6. Per-journal table
    # ------------------------------------------------------------------
    def build_journal_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate papers per journal: count, mean IF, JCR and CAS tier.

        Returns an empty frame when *df* is empty or has no ``Journal``
        column; missing ranking columns are filled with NaN.
        """
        if "Journal" not in df.columns or df.empty:
            return pd.DataFrame()

        needed = ["SCI_IF", "SCI_JCR", "CAS_Upgrade"]
        missing = [c for c in needed if c not in df.columns]
        # reindex returns a new frame, so the caller's data is never mutated.
        df2 = df.reindex(columns=list(df.columns) + missing) if missing else df.copy()
        df2["SCI_IF"] = pd.to_numeric(df2["SCI_IF"], errors="coerce")

        grouped = (
            df2.groupby("Journal")
            .agg(
                Count=("Journal", "count"),
                Mean_IF=("SCI_IF", "mean"),
                JCR=("SCI_JCR", "first"),
                CAS=("CAS_Upgrade", "first"),
            )
            .sort_values("Count", ascending=False)
            .reset_index()
        )
        grouped["Mean_IF"] = grouped["Mean_IF"].round(2)
        return grouped

    # ------------------------------------------------------------------
    # 7. Figures
    # ------------------------------------------------------------------
    def plot_and_save_figures(self, df: pd.DataFrame, fig_dir):
        """Save the IF histogram and JCR pie chart as PNG files in *fig_dir*.

        A figure is skipped when its column is absent or has no usable data.
        """
        import matplotlib.pyplot as plt

        fig_dir = Path(fig_dir)
        fig_dir.mkdir(parents=True, exist_ok=True)

        # Font fallback chain; SimHei covers CJK glyphs where installed.
        plt.rcParams.update({
            "font.sans-serif": ["Arial", "DejaVu Sans", "SimHei"],
            "font.size": 12,
        })

        # Fig 1: impact-factor distribution.
        if "SCI_IF" in df.columns:
            if_series = pd.to_numeric(df["SCI_IF"], errors="coerce").dropna()
            if not if_series.empty:
                fig, ax = plt.subplots(figsize=(6, 4))
                ax.hist(if_series, bins=10, color="#4c72b0",
                        edgecolor="black", alpha=0.8)
                ax.set_xlabel("SCI Impact Factor (IF)")
                ax.set_ylabel("Count")
                ax.set_title("Distribution of Impact Factors", pad=12)
                ax.grid(axis="y", linestyle="--", alpha=0.5)
                plt.tight_layout()
                fig.savefig(fig_dir / "Fig1_IF_distribution.png", dpi=300)
                plt.close(fig)

        # Fig 2: JCR quartile shares.
        if "SCI_JCR" in df.columns:
            jcr_counts = df["SCI_JCR"].value_counts().sort_index()
            if not jcr_counts.empty:
                fig, ax = plt.subplots(figsize=(5, 5))
                colors = ["#5b9bd5", "#ed7d31", "#a5a5a5", "#ffc000"]
                wedges, texts, autotexts = ax.pie(
                    jcr_counts,
                    labels=jcr_counts.index,
                    autopct="%1.1f%%",
                    startangle=90,
                    colors=colors[:len(jcr_counts)],
                )
                plt.setp(autotexts, size=10, weight="bold", color="white")
                ax.set_title("JCR Quartile Distribution", pad=12)
                plt.tight_layout()
                fig.savefig(fig_dir / "Fig2_JCR_quartile.png", dpi=300)
                plt.close(fig)

    # ------------------------------------------------------------------
    # 8. Word report
    # ------------------------------------------------------------------
    def export_word_report(self, df, table_df, fig_dir, out_docx):
        """Write the summary, journal table and figures to a .docx file.

        Args:
            df: Per-paper result frame from ``batch_analyze``.
            table_df: Per-journal frame from ``build_journal_table``.
            fig_dir: Folder holding the PNG files from
                ``plot_and_save_figures``; missing figures are skipped.
            out_docx: Destination path of the report.
        """
        from docx import Document
        from docx.enum.text import WD_ALIGN_PARAGRAPH
        from docx.shared import Inches

        doc = Document()
        heading = doc.add_heading("Bibliometric Analysis Report", level=0)
        heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Summary paragraph.
        s = self.summarize(df)
        if s:
            doc.add_heading("1. Summary", level=1)
            para = (
                f"This study analyzed {s['Total papers']} journal articles. "
                f"DOIs were extracted from {s['DOI detected']} files. "
                f"JCR Q1 papers: {s['SCI Q1']}. "
                f"CAS Tier 1 papers: {s['CAS 1st Quartile']}."
            )
            doc.add_paragraph(para)

        # Journal table.
        if not table_df.empty:
            doc.add_heading("2. Journal Distribution", level=1)
            table = doc.add_table(rows=len(table_df) + 1,
                                  cols=len(table_df.columns))
            table.style = "Table Grid"
            rows = list(table.rows)  # materialise once, then index the list

            # Header row; an empty header cell has no run to embolden.
            for cell, col in zip(rows[0].cells, table_df.columns):
                cell.text = str(col)
                for run in cell.paragraphs[0].runs:
                    run.bold = True

            # Body rows, filled by position so the frame's index labels do
            # not have to be 0..n-1.
            values_iter = table_df.itertuples(index=False, name=None)
            for row, values in zip(rows[1:], values_iter):
                for cell, val in zip(row.cells, values):
                    cell.text = str(val) if pd.notnull(val) else "-"

        # Figures.
        doc.add_heading("3. Statistical Figures", level=1)
        fig_dir = Path(fig_dir)

        def add_fig(name, caption):
            path = fig_dir / name
            if path.exists():
                # python-docx is given a str path rather than a Path object.
                doc.add_picture(str(path), width=Inches(5.0))
                doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                doc.add_paragraph(caption).alignment = WD_ALIGN_PARAGRAPH.CENTER

        add_fig("Fig1_IF_distribution.png", "Figure 1. IF Distribution")
        add_fig("Fig2_JCR_quartile.png", "Figure 2. JCR Quartile")

        out_path = Path(out_docx)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(out_path))
        print(f"✅ Word 报告已保存: {out_docx}")


# ----------------------------------------------------------------------
# Configuration for running this file as a script
# ----------------------------------------------------------------------
PAPERS_DIR = ""             # folder containing the PDF files
SECRET_KEY = ""             # easyScholar key
OUTPUT_DIR = "./output"     # where the results are written
EMAIL = "youremail@uni.edu"  # contact address for Crossref


def main():
    """Run the full pipeline using the configuration constants above."""
    # An empty PAPERS_DIR would resolve to the current directory and be
    # scanned silently, so refuse to run until both values are filled in.
    if not PAPERS_DIR or not SECRET_KEY:
        raise SystemExit("请先在配置区填写 PAPERS_DIR 和 SECRET_KEY")

    tool = JournalLiteratureStatistics(
        papers_dir=PAPERS_DIR,
        secret_key=SECRET_KEY,
        email=EMAIL,
    )

    try:
        # 1. Batch analysis.
        df_result = tool.batch_analyze()

        # 2. Remove characters Excel rejects (IllegalCharacterError).
        #    DataFrame.map exists from pandas 2.1; older versions only have
        #    applymap.  Checking with hasattr avoids masking an
        #    AttributeError raised inside the mapped function.
        if hasattr(df_result, "map"):
            df_result = df_result.map(tool.clean_text)
        else:
            df_result = df_result.applymap(tool.clean_text)

        # 3. Excel export.
        Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
        df_result.to_excel(
            f"{OUTPUT_DIR}/Literature_Stats_High_value.xlsx", index=False
        )
        print(f"✅ Excel 数据已保存")

        # 4. Figures and Word report.
        df_journals = tool.build_journal_table(df_result)
        tool.plot_and_save_figures(df_result, fig_dir=OUTPUT_DIR)
        tool.export_word_report(
            df_result, df_journals,
            fig_dir=OUTPUT_DIR,
            out_docx=f"{OUTPUT_DIR}/Report.docx",
        )
    except Exception as e:
        print(f"❌ 程序运行出错: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()