做科研的过程中会下载大量PDF文件,它们杂乱地堆积在同一个目录里,怎样才能把文件名整理得更有意义?
我用Python写了一个小程序:读取文献的DOI,然后自动重命名文件。
当然,也可以把它集成到右键菜单中,使用起来更方便。
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 28 17:07:52 2020
代码已经修正可以运行
@author: Prof. Huigang Zhang, Chinese Academy of Sciences
"""
import os
import re
import jieba.analyse
from PyPDF4 import PdfFileReader, utils
#from shutil import copy, copy2
import sys
import importlib
import bibtexparser
import difflib
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LTTextBox, LTTextLine, LAParams
#from pdfminer.pdfpage import PDFTextExtractionNotAllowed
#from doi2bib.crossref import get_bib_from_doi
from doi2bib.crossref import get_bib
import datetime

# Source / destination directories; both are reassigned by the script entry
# point from the command-line argument.
des_dir = []
src_dir = []
# Counter incremented by the script entry point after each successful rename.
num = 0
# Window of plausible publication years. The upper bound follows the clock
# (next calendar year, because articles are often dated ahead) instead of a
# fixed year that silently goes stale.
maxyear = datetime.date.today().year + 1
minyear = 1900
# Original paths, new paths, and every generated name (used to detect
# duplicate names within one run).
src_file = []
des_file = []
cpd_file = []
# Characters replaced by '_' in removeNAN().
nan_char = ['\\', '*', '?', '"', ':', '/', '<', '>', '(', ')', '{', '}',
            '$', '=', '\n', '\r', '\f', '#', '&', '^', '@', '!', '%']
# Filler words dropped by removeRedundant() when surrounded by spaces.
kick_word = ['the', 'of', 'after', 'before', 'and', 'or', 'so', 'same', 'in',
             'on', 'below', 'from', 'for', 'this', 'at', 'has', 'be', 'have',
             'all', 'was', 'by', 'also', 'are', 'is', 'these', 'those',
             'some', 'a', 'an', 'one', 'to', 'between', 'into', 'sub',
             'textendash', 'less', 'greater', 'been', 'as', 'with']
# Markup fragments deleted by removeRedundant(). Raw strings keep the literal
# backslash without relying on an invalid escape sequence.
kick_format = [r'$\less$', r'$\greater$', '$less$', '$greater$', '_less_',
               '_greater_', '_sub_']
# Substrings that make IsOrdinary() treat a title as uninformative.
ordinary = ['microsoft', 'static', 'paper', 'article', 'science', 'magazine']
# Module-level flag read by IsAGoodTitle() and IsFoundBetterThanFname().
chinesetitle = False
def string_similar(s1, s2):
    """Return the similarity of *s1* and *s2* as a float between 0 and 1.

    ``SequenceMatcher.ratio()`` is used rather than ``quick_ratio()``:
    the latter is only an upper bound that ignores character order, so two
    unrelated strings built from the same letters would score 1.0.
    """
    return difflib.SequenceMatcher(None, s1, s2).ratio()
def removeRedundant(paper_title):
    """Lower-case a title and strip markup fragments and filler words.

    Every entry of ``kick_format`` is deleted, ``textendash`` becomes ``-``,
    and every entry of ``kick_word`` that appears surrounded by single
    spaces is collapsed to one space.
    """
    cleaned = paper_title.lower()
    for fragment in kick_format:
        cleaned = cleaned.replace(fragment, '')
    cleaned = cleaned.replace('textendash', '-')
    for word in kick_word:
        cleaned = cleaned.replace(' ' + word + ' ', ' ')
    return cleaned
def IsOrdinary(paper_title):
    """Return True when a title is too short or too generic to be useful.

    A title counts as ordinary when it has fewer than two characters, splits
    into fewer than two pieces on non-word characters, or contains any entry
    of ``ordinary`` (case-insensitive).
    """
    if len(paper_title) < 2:
        return True
    if len(re.split(r'\W+', paper_title)) < 2:
        return True
    lowered = paper_title.lower()
    return any(generic in lowered for generic in ordinary)
def is_contains_chinese(strs):
    """Return True if any character of *strs* lies in U+4E00..U+9FA5."""
    return any('\u4e00' <= ch <= '\u9fa5' for ch in strs)
def list_all_files(rootdir):
    """Return the paths of all regular files under *rootdir*, recursively.

    Entries are visited in ``os.listdir`` order; the files of a
    sub-directory are added at the point where that directory is met.
    """
    collected = []
    for entry in os.listdir(rootdir):
        full_path = os.path.join(rootdir, entry)
        if os.path.isdir(full_path):
            collected.extend(list_all_files(full_path))
        if os.path.isfile(full_path):
            collected.append(full_path)
    return collected
def checkSAMEname(newname):
    """Return True if *newname* was already recorded in ``cpd_file``."""
    return newname in cpd_file
def removeNAN(paper_title):
    """Replace every character listed in ``nan_char`` with an underscore."""
    for forbidden in nan_char:
        paper_title = paper_title.replace(forbidden, '_')
    return paper_title
def doi2name(doi):
    """Look up a single DOI and return ``"<year>_<title>"`` from its BibTeX.

    The lookup status returned by ``get_bib`` is not inspected; the first
    parsed entry is used directly.
    """
    _found, bibtex_text = get_bib(doi)
    entry = bibtexparser.loads(bibtex_text).entries[0]
    return '_'.join((entry["year"], entry["title"]))
def _first_doi_token(candidate):
    """Return *candidate* without leading spaces, cut at the first space or comma."""
    stripped = candidate.lstrip(' ')
    for pos, ch in enumerate(stripped):
        if ch == ' ' or ch == ',':
            return stripped[:pos]
    # No terminator at all: the whole span is the token.
    return stripped


def findDOIfromString(str):
    """Extract a DOI from free text.

    Two notations are tried in order, both case-insensitively: ``DOI:<doi>``
    and ``doi.org/<doi>``. For each, up to 50 characters after the marker are
    taken and cut at the first space or comma; the result is accepted only
    if it is longer than 10 characters.

    Parameters:
        str: text to search (the name shadows the builtin but is kept for
            compatibility with existing callers).

    Returns:
        The DOI string, or an empty list when nothing usable is found, so
        callers can keep testing the result with ``if doi:``.
    """
    print('Running findDOIfromString function'.rjust(100, '-'))
    searches = (
        (r"DOI:(.{5,50})", '找到了DOI:格式'),
        # The dot is escaped so only a literal "doi.org/" matches.
        (r"DOI\.org/(.{5,50})", '找到了doi.org/格式'),
    )
    for pattern, message in searches:
        match = re.search(pattern, str, re.I)
        if match is None:
            continue
        token = _first_doi_token(match.group(1))
        if len(token) > 10:
            print(message.rjust(100, '-'), token)
            return token
    print('两种方式都没找到DOI'.rjust(100, '-'))
    return []
def chooseone(paper_title, doi):
    """Pick between a locally found title and the one registered for *doi*.

    Returns ``(year, title)`` with the title in title case. When the lookup
    fails, ``year`` is an empty list and the given title is returned. When
    it succeeds, the given title is kept only if it is longer than 10
    characters and scores above 0.7 against the cleaned online title;
    otherwise the online title is used.
    """
    found, bib = get_bib(doi)
    if not found:
        return [], paper_title.title()

    entry = bibtexparser.loads(bib).entries[0]
    print(entry["title"])
    fetched_title = re.sub('[\f\n\r\t\v{}]', '', entry["title"])
    fetched_title = removeRedundant(removeNAN(fetched_title))
    year = entry["year"]
    print('[[[[[', year, fetched_title, doi, ']]]]]')

    keep_given = (len(paper_title) > 10
                  and string_similar(paper_title, fetched_title) > 0.7)
    chosen = paper_title if keep_given else fetched_title
    return year, chosen.title()
def getAllfromDOI(doi):
    """Look up *doi* online and build ``(year, "<JOURNAL> <Title>")``.

    The journal name is cleaned and, when it has more than one word, reduced
    to the upper-cased initials of its words; a single-word journal is
    simply upper-cased. The title is cleaned and title-cased.

    Entries without a ``journal`` field (for example books or proceedings)
    no longer abort the lookup: the title is returned without a prefix.
    A missing ``year`` or ``title`` yields an empty value for that item.

    Returns:
        ``(year, title)``; both are empty lists when the lookup fails or the
        BibTeX contains no entry.
    """
    found, bib = get_bib(doi)
    entries = bibtexparser.loads(bib).entries if found else []
    if not entries:
        print('网上搜...提取bib信息...不成功!!!!!!')
        return [], []

    entry = entries[0]
    raw_title = entry.get("title", "")
    print('网上搜到了 Title=', raw_title)
    new_title = re.sub('[\f\n\r\t\v{}]', '', raw_title)
    # Runs of two or more non-word characters are deleted outright.
    new_title = re.sub(r'[_\W]{2,}', '', new_title)
    new_title = removeRedundant(removeNAN(new_title)).strip()

    journal = re.sub('[\f\n\r\t\v{}]', '', entry.get("journal", ""))
    journal = removeRedundant(removeNAN(journal)).strip()
    # split() without an argument never yields empty words, so taking the
    # first letter of each word cannot fail on doubled spaces.
    words = journal.split()
    if len(words) > 1:
        journal = ''.join(word[0] for word in words).upper()
    else:  # single-word journals such as nature, science, joule
        journal = journal.upper()
    print('网上搜到了 JOURNAL=', journal + "\n")

    year = entry.get("year", [])
    new_title = (journal + ' ' + new_title.title()).strip()
    print('网上搜...提bib信息...成功.............')
    print('[', year, new_title, 'DOI:', doi, ']')
    return year, new_title
def IsAGoodTitle(paper_title):
    """Decide whether *paper_title* is usable as it stands.

    ``False`` means the caller should keep looking for a better title.
    The module-level ``chinesetitle`` flag short-circuits to ``True``.
    Otherwise a title is rejected when it is empty, equals ``none`` in any
    letter case, is ordinary according to ``IsOrdinary``, or is longer than
    1.2 times its own length with the digits removed.
    """
    if chinesetitle:
        return True
    if not paper_title or paper_title.upper() == 'NONE':
        return False
    if IsOrdinary(paper_title):
        return False
    without_digits = re.sub(r'[\d]', '', paper_title)
    # Too many digits means the title is not informative.
    return len(paper_title) <= 1.2 * len(without_digits)
def IsFoundBetterThanFname(paper_title, fname):
    """Return True when the found title should replace the file name.

    The found title loses when the module-level ``chinesetitle`` flag is
    set, when the title is empty or equals ``none`` in any letter case, or
    when the file name is longer than the title.
    """
    if chinesetitle or not paper_title:
        return False
    if paper_title.upper() == 'NONE':
        return False
    return len(fname) <= len(paper_title)
# 对本地保存的pdf文件进行读取和写入到txt文件当中
# 定义解析函数
def pdftotxt(path):
    """Extract the text of the leading pages of a PDF with pdfminer.

    Parameters:
        path: despite the name, an already opened file handle of the PDF.

    Returns:
        The concatenated text of the first two pages, extended page by page
        while fewer than 50 characters have been collected. An empty string
        is returned when the document does not allow text extraction.
    """
    parser = PDFParser(path)
    document = PDFDocument(parser)
    str_text = ''
    if not document.is_extractable:
        print('pdftotxt.......is not extractable')
        return str_text

    resmag = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(resmag, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(resmag, device)
        for pageNumber, page in enumerate(PDFPage.create_pages(document)):
            # The first page is often problematic, so always read two pages
            # and keep going only while too little text has been gathered.
            # The text only grows, so once this holds it holds for every
            # later page and the walk can stop.
            if pageNumber > 1 and len(str_text) >= 50:
                break
            interpreter.process_page(page)
            layout = device.get_result()
            for y in layout:
                if isinstance(y, (LTTextBox, LTTextLine)):
                    str_text = str_text + y.get_text()
    finally:
        # Release the device even when a page fails to parse.
        device.close()
    return str_text
def ContainYear(str):
    """Return True if *str* starts with four digits forming a plausible year.

    Plausible means strictly between the module-level ``minyear`` and
    ``maxyear``.
    """
    leading = str[0:4]
    return leading.isdigit() and minyear < int(leading) < maxyear
def getFilename2nd(fname, f):
    """Work out a descriptive new file name for one PDF.

    The search proceeds in stages and stops improving as soon as a usable
    year and title are known:

    1. PDF document properties (title and subject), including a DOI lookup
       when the subject holds a DOI.
    2. Text of the leading pages: DOI lookup, then a scan for a four-digit
       year.
    3. Keyword extraction with jieba when the title is still not usable.

    Any failure falls back to the cleaned original file name.

    Parameters:
        fname: current file name without directory or extension.
        f: the PDF opened in binary mode.

    Returns:
        The proposed file name including the ``.pdf`` extension.
    """
    # IsAGoodTitle() and IsFoundBetterThanFname() read the module-level
    # flag, so it has to be assigned globally rather than as a local.
    global chinesetitle
    first_page = []
    paper_title = []  # stays empty when nothing was found
    new_title = []    # pre-set so the later test cannot hit an unbound name
    year = []
    page_num = 0
    chinesetitle = is_contains_chinese(fname)
    try:
        f.seek(0)
        fname = removeRedundant(re.sub(r'\W+', ' ', fname))
        # Usual case: journal PDFs carry naming hints in their properties.
        print('\n\033[1;33m(1)<----从pdf文件自己属性中得到如下信息---->\033[0m')
        pdf_reader = PdfFileReader(f)
        docInfo = pdf_reader.getDocumentInfo()
        # A PDF without an info dictionary yields None here; treat both
        # fields as absent instead of aborting the whole analysis.
        paper_title = docInfo.title if docInfo else None
        paper_journal = docInfo.subject if docInfo else None
        page_num = pdf_reader.getNumPages()
        paper_title = str(paper_title)
        # Characters such as '/' cannot appear in file names.
        paper_title = removeRedundant(removeNAN(paper_title))
        paper_journal = str(paper_journal)
        print("NUMBER:[%s] TITLE=" % num, paper_title)
        print("NUMBER:[%s] JOURNAL=" % num, paper_journal)
        print('先从PDF文件属性的Journal中找找DOI.....')
        doi = findDOIfromString(paper_journal)
        if doi:
            print('从PDF文件的属性的Journal中找到DOI:%s' % doi)
            year, new_title = getAllfromDOI(doi)
            if year and new_title:
                paper_title = new_title
                print('从journal找到的DOI,上doi.org网搜到了YEAR和TITLE')
            else:
                print('虽然从journal找到了DOI,但是上网检索失败,没找到YEAR和TITLE')
        else:
            print('没有从Journal中找到DOI')
        if not year:
            # Unlikely, but the subject text may still mention a year.
            year_match = re.search(r'(19|20)[0-9]{2}', paper_journal)
            if year_match:
                year = year_match.group(0)
                print('合理吗?从PDF文件的属性Journal中找到的这个YEAR=', year)
            else:
                year = []
                print('没有从PDF文件的属性journal中找到YEAR')
        if not year or IsOrdinary(paper_title):
            # Properties gave no year, or only a generic title.
            print('没有从pdf文件属性中的到YEAR信息,或者名字很平庸,\n')
            print('\033[1;33m(2)<-----开始分析第一页文本------>\033[0m')
            print('Searching YEAR from the first_page')
            f.seek(0)
            first_page = str(pdftotxt(f))
            first_page = re.sub('[\f\n\r\t\v]', ' ', first_page)
            print('开始从第一页文本中找DOI信息')
            doi = findDOIfromString(first_page)
            if doi:
                print('找到了DOI,开始分析DOI,提取BIB信息')
                year, new_title = getAllfromDOI(doi)
            else:
                print('第一页中DOI搜索不成功')
            if year and new_title:
                # The online record wins.
                paper_title = new_title
            else:
                # The lookup gave no year (the DOI may be wrong); scan the
                # text for four-digit numbers and keep the last plausible one.
                print('提取bib信息不成功....分析第一页文本,看看能不能找到YEAR信息')
                for m in re.finditer(r"\W(19|20)[0-9]{2}\W", first_page):
                    tempyear = m.group(0)[1:5]
                    numyear = int(tempyear)
                    if numyear > maxyear or numyear < minyear:
                        continue
                    year = tempyear
                    print(year)
                print('好吧,合理吗?第一页文本分析找到的这个 YEAR=', year)
        # Properties and DOI are exhausted; fall back to brute force and
        # derive a name from the page text. Chinese documents skip this.
        print('\n\033[1;33m(1-2)PDF文本属性和DOI搜索尝试都已完成,找到下面结果,检查结果是否合理...\033[0m')
        if IsAGoodTitle(paper_title) == False:
            print('PDF文本属性和DOI搜索尝试的TITLE=%s.....不合理' % paper_title)
            print('paper_title is not informative: ', paper_title)
            print('After removing NUMBERS')
            print(re.sub(r'[\d]', '', paper_title))
            if not first_page:
                f.seek(0)
                first_page = str(pdftotxt(f))
                first_page = re.sub(r'\W+', ' ', first_page)
            if first_page:
                print('\n(3)<----用第一页文本做JIEBA分析---->看看能不能抽提出关键词,组合成TITLE')
                tags = jieba.analyse.extract_tags(first_page, topK=10)
                paper_title = ' '.join(tags)
                paper_title = removeRedundant(paper_title).title()
                print('\033[1;34mUsing JIEBA................USING JIEBA..................\033[0m')
                # Use the same counter as the log lines above; the loop
                # index of the script entry point is not defined when this
                # function is called from elsewhere.
                print("\033[1;34mNUMBER: [%s] JIEBA-TITLE=%s\033[0m" % (num, paper_title))
                # Strip digits. The earlier find/replace on the two-character
                # text backslash-d never matched anything.
                paper_title = re.sub(r'\d', '', paper_title)
        else:
            print('没有用JIEBA,就找了看似合理的TITLE,因为文件名中有-----中文--or---有意义的英文')
        if IsFoundBetterThanFname(paper_title, fname) == False:
            # Nothing better than the existing name.
            paper_title = fname.title()
    except utils.PdfReadError as e:
        print('\033[5;31;40m!!!!!!!!!!!!!!!!!!!!utils.PdfReadError...\033[0m')
        print("异常代码...%s" % e)
        paper_title = fname
        print('异常了,就用原文件名')
    except Exception as e:
        # Best effort: any other failure keeps the original name, but
        # Ctrl-C and interpreter exit are no longer swallowed.
        print("常规和Jieba找不到合适名字来命名。。。。。未知异常,用原来文件名")
        print("异常代码...%s" % e)
        paper_title = fname

    # Year and title are settled; assemble the file name.
    if (not year) and (not paper_title):
        newname = fname + '.pdf'
        print('实在找不到合适的名字,就用原文件名')
    elif (not year) and paper_title:
        newname = paper_title + '.pdf'
    elif year and (not paper_title):
        if ContainYear(fname) == False:
            newname = year + ' ' + fname + '.pdf'
        else:
            newname = fname + '.pdf'
    elif ContainYear(paper_title) == False:
        newname = year + ' ' + paper_title + '.pdf'
    else:
        newname = paper_title + '.pdf'
    if page_num > 100:
        newname = '[BOOK] ' + newname
    if checkSAMEname(newname):
        # The name was already generated in this run; keep the original.
        newname = fname + '.pdf'
        if year and ContainYear(fname) == False:
            newname = year + ' ' + newname
    return newname
#------------------------------------------------------------------------------
# main program Starting from here
#------------------------------------------------------------------------------
if __name__ == '__main__':
    # Script entry point. The single argument is a path; every PDF in that
    # path's directory (searched recursively) is analysed and renamed in
    # place.
    if len(sys.argv) != 2:
        sys.exit('argv error!')
    print(sys.argv[1])
    (src_dir, fname) = os.path.split(sys.argv[1])
    if not src_dir:
        # A bare file name has no directory part; use the current directory.
        src_dir = os.curdir
    des_dir = src_dir
    if not os.path.exists(des_dir):  # create the target directory if missing
        os.makedirs(des_dir)
    files = []  # defined up front so the summary works on every path
    if os.path.exists(src_dir):
        files = list_all_files(src_dir)
        index = 0
        for file in files:
            (pathfile, ext) = os.path.splitext(file)
            (path, fname) = os.path.split(pathfile)
            index += 1
            # Skip non-PDF files, hidden files and names starting with "ii".
            if (ext.lower() != '.pdf' or fname.startswith('.')
                    or fname.startswith('ii')):
                continue
            print('\nNUMBER:[%s]************************Start to rename a PDF file*******************************' % index)
            print('\n--Renaming this file now!\n---->', file)
            try:
                with open(file, 'rb') as f:
                    newname = getFilename2nd(fname, f)
            except OSError as e:
                print("Unable to open file. %s" % e)
                continue
            print("\n\033[1;33m(***)分析结论:\033[0m")
            print("\033[1;33mFILENAME=%s\033[0m" % newname)
            cpd_file.append(newname)  # remembered to detect duplicate names
            bkname = os.path.join(path, newname)
            try:
                if file == bkname:
                    pass
                elif os.path.exists(bkname) and not os.path.samefile(file, bkname):
                    # os.rename replaces an existing target on POSIX; never
                    # destroy another file for the sake of a nicer name.
                    print("Unable to rename file. Target already exists: %s" % bkname)
                else:
                    os.rename(file, bkname)
                    src_file.append(file)
                    des_file.append(bkname)
                    num += 1
            except IOError as e:
                print("Unable to rename file. %s" % e)
            except Exception:
                print("renaming 未知异常: ", sys.exc_info())
            print('\nNUMBER: [%s]**********************End of renaming**************\n\n\n' % index)
    else:
        print("该路径下不存在所查找的目录!")
    print('\n\nDone!!!---------Summary---------')
    print("[%d] of [%d] files are renamed in total\n\n" % (num, len(files)))
    for old_path, new_path in zip(src_file, des_file):
        print(old_path)
        (newpath, filename) = os.path.split(new_path)
        print('----------------->', filename)
    if os.name == 'nt':
        # "pause" exists only in the Windows command shell.
        os.system("pause")