python3解析cnmarc格式
cnmarc是中国图书馆的机读格式,这里是超简单讲解(https://blog.csdn.net/chaishen10000/article/details/79245055)
这里是一个用ASP解析的实现,不过有个小问题:它没有把中文(全角)标点符号按双字节来计算长度(https://blog.csdn.net/dyllove98/article/details/8717830)
下面的代码封装成了Django视图,逻辑一看就懂(解析之前需要先把ISO文件改名为txt)
#main.py
#!/usr/bin/env python
# coding=utf-8
#
# Author: archer
# File: main.py
# Desc: Process all json data files, extract the BookInfo
# Produced By BR
from django.http import HttpResponse
import json
from app.c.util import ParseJsonCNMARC
import glob
import pymysql
import re

# UTF-8 byte-order mark that some editors prepend to text files.
_UTF8_BOM = b'\xef\xbb\xbf'

# Full-width punctuation that is_chinese() treats like a Han character.
# Written as escapes so the table cannot silently degrade to ASCII look-alikes.
_FULLWIDTH_PUNCTUATION = frozenset(
    "\uff0c\u3001\u3002\uff1f\uff01\uff08\uff09\uff1a\uff1b"
)

# Column order of the `books` table; identical to the key order of resultData().
_BOOK_COLUMNS = (
    "bookNumber", "title", "authors", "index", "isbn", "publisher",
    "publishPlace", "publishTime", "price", "YSBMY", "entryTime", "context",
    "pages", "format", "classifyNumber", "summary", "origin",
)

# (output key, BookInfo key) pairs that formData() copies verbatim.
_DIRECT_FIELDS = (
    ("title", "title"),
    ("index", "clc_sort_num"),
    ("isbn", "isbn"),
    ("publisher", "publisher_name"),
    ("publishPlace", "publish_place"),
    ("publishTime", "print_date"),
    ("pages", "pagesize"),
    ("summary", "summary"),
)

_TRAILING_NUMBER = re.compile(r"[0-9.]+$")
_ANY_NUMBER = re.compile(r"\d+(?:\.\d+)?")


def removeBom(file):
    """Strip a leading UTF-8 BOM from *file* in place.

    The file is rewritten only when it actually starts with the BOM; both
    file handles are closed before returning.
    """
    with open(file, 'rb') as source:
        if source.read(3) != _UTF8_BOM:
            return
        body = source.read()
    with open(file, 'wb') as target:
        target.write(body)


def is_chinese(uchar):
    """Return 1 if *uchar* is a Han character or full-width punctuation, else 0.

    Han characters are those in U+4E00..U+9FA5.  ASCII punctuation is
    deliberately NOT included: it is a single byte in GBK.
    """
    if uchar in _FULLWIDTH_PUNCTUATION:
        return 1
    return 1 if u'\u4e00' <= uchar <= u'\u9fa5' else 0


def _gbk_width(char):
    """Return how many bytes *char* occupies once encoded as GBK."""
    if char < '\x80':
        return 1
    # 'replace' keeps the walk going (width 1) for characters GBK cannot encode.
    return len(char.encode('gbk', errors='replace'))


def GetCharLength(MarcText, _Len):
    """Convert a GBK byte offset into a character index of *MarcText*.

    Walks the text from the start, adding each character's GBK byte width,
    and returns the index of the first character at or past byte *_Len*
    (or ``len(MarcText)`` when the text is shorter than that).
    """
    consumed = 0
    index = 0
    while consumed < _Len and index < len(MarcText):
        consumed += _gbk_width(MarcText[index])
        index += 1
    return index


def toJson(dd):
    """Split one ISO 2709 (CNMARC) record line into a ``{tag: text}`` dict.

    *dd* is the decoded record.  Characters 12-16 of the leader hold the base
    address of the data area; the directory between the 24-character leader
    and that address consists of 12-character entries (3-char tag, 4-digit
    length, 5-digit start offset).  Offsets are byte based, so they are mapped
    to character indexes with GetCharLength().

    Delimiters are removed from each value, and the sequence ``\\x1f`` + ``f``
    is rewritten to ``b`` exactly as the downstream parser expects.  When a
    tag repeats, the last occurrence wins.

    Raises ValueError if the leader or a directory entry is not numeric.
    """
    base_address = int(dd[12:17])
    field_count = (base_address - 24 - 1) // 12
    directory = dd[24:base_address - 1]
    contents = dd[base_address:]

    record = {}
    for number in range(field_count):
        entry = directory[number * 12:number * 12 + 12]
        tag = entry[0:3]
        length = int(entry[3:7])
        offset = int(entry[7:12])

        start = GetCharLength(contents, offset)
        end = GetCharLength(contents, offset + length)
        value = contents[start:end]
        record[tag] = (
            value.replace("\x1ff", 'b')
            .replace("\x1f", "")
            .replace("\x1e", "")
            .replace("\t", "")
            .strip()
        )
    return record


def resultData():
    """Return a fresh row template holding the default value of every column."""
    data = {
        "bookNumber": "",
        "title": "",
        "authors": "",
        "index": "",
        "isbn": "",
        "publisher": "",
        "publishPlace": "",
        "publishTime": "",
        "price": 0.0,
        "YSBMY": "",
        "entryTime": "2004-05-24",
        "context": "",
        "pages": "",
        "format": "",
        "classifyNumber": "",
        "summary": "",
        "origin": "",
    }
    return data


def connectMysql():
    """Open the local ``test`` database and return ``(connection, cursor)``."""
    conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                           passwd="", db="test", charset="utf8")
    cur = conn.cursor()
    return conn, cur


def insertIntoMysql(data):
    """Bulk-insert *data* into the ``books`` table.

    Each row may be a dict shaped like resultData() or an already ordered
    sequence of the 17 column values.  The transaction is rolled back and the
    error re-raised on failure; the cursor and connection are always closed.
    """
    # The placeholders are positional, so dict rows must be flattened first.
    rows = [
        tuple(row[column] for column in _BOOK_COLUMNS)
        if isinstance(row, dict) else row
        for row in data
    ]
    sql2 = "insert into {}" \
           "(bookNumber,title,authors,`index`,isbn,publisher,publishPlace,publishTime,price,YSBMY,entryTime,context,pages,format,classifyNumber,summary,origin) " \
           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)".format("books")
    conn, cur = connectMysql()
    try:
        cur.executemany(sql2, rows)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        cur.close()
        conn.close()


def getPrice(price):
    """Extract the numeric amount from a price string such as ``CNY25.00``.

    A number at the very end of the string is preferred; when there is none
    (e.g. ``25.00元``) the first number found anywhere is used instead.

    Raises TypeError if *price* is not a string and ValueError if it
    contains no number.
    """
    trailing = _TRAILING_NUMBER.search(price)
    if trailing:
        try:
            return float(trailing.group())
        except ValueError:
            pass  # e.g. a lone "." -- fall through to the general search
    anywhere = _ANY_NUMBER.search(price)
    if anywhere is None:
        raise ValueError("no numeric price in {!r}".format(price))
    return float(anywhere.group())


def formData(BookInfo):
    """Map a parsed *BookInfo* dict onto the ``books`` row layout.

    Missing or unusable source values leave the resultData() default in
    place.  ``context`` is the concatenation of the main text fields, with
    ``None`` values treated as empty so one missing field no longer blanks
    the whole string.
    """
    data = resultData()

    for target, source in _DIRECT_FIELDS:
        if source in BookInfo:
            data[target] = BookInfo[source]

    authors = BookInfo.get("authors")
    if authors:
        data["authors"] = authors[0]

    try:
        data["price"] = getPrice(BookInfo["price"])
    except (KeyError, TypeError, ValueError):
        pass  # unknown price: keep the 0.0 default

    size = BookInfo.get("size")
    if isinstance(size, str):
        data["format"] = size + 'cm'

    def text(key):
        return data[key] or ""

    data["context"] = "".join([
        text("title"),
        text("authors"),
        text("isbn"),
        text("index"),
        text("publisher"),
        text("publishPlace"),
        text("publishTime")[0:4],
        "CNY{}".format(data["price"]),
        text("summary"),
    ])
    return data


def getJsonData(path):
    """Parse every record line of the GBK text file *path* with toJson().

    A UTF-8 BOM at the start of the file is removed first; note that this
    rewrites the file on disk.  Blank lines are skipped.
    """
    removeBom(path)
    with open(path, 'r', encoding="gbk") as marc_file:
        return [toJson(line) for line in marc_file if line.strip()]


def index(request):
    """Django view: parse the MARC file named by POST ``path`` and return JSON."""
    # NOTE(security): `path` comes straight from the client and is both read
    # and rewritten (BOM removal) on the server.  Restrict it to a known
    # directory before exposing this view to untrusted users.
    path = request.POST.get('path', 'testt.txt')
    recs = getJsonData(path)
    allData = []
    for rec in recs:
        BookInfo = ParseJsonCNMARC(rec)
        ddd = formData(BookInfo)
        allData.append(ddd)
    return HttpResponse(json.dumps(allData), content_type="application/json")
#util.py
#!/usr/bin/env python
# coding=utf-8
import time
import datetime

# Maps every subject-subfield code of field 606 to '#' in a single pass.
_TAG_SEPARATORS = str.maketrans('axyzj', '#####')


# Read a file into one variable.
# Takes a file path, returns its text as a string.
def ReadFile(filename):
    """Return the whole text content of *filename*."""
    with open(filename, 'r') as f:
        return f.read()


def _first_marker(field, markers):
    """Return the index of the first marker (in priority order) found in *field*, or None."""
    for marker in markers:
        position = field.find(marker)
        if position != -1:
            return position
    return None


def _subfield(field, code, stops=()):
    """Return the text after the first *code* in *field*, or None if it is missing.

    The text ends at the first of *stops* that occurs anywhere in the field
    (checked in the given priority order) and otherwise runs to the end of
    the field.
    """
    if field is None or code not in field:
        return None
    start = field.index(code) + 1
    return field[start:_first_marker(field, stops)]


# Parse one CNMARC record given as a dict.
# Takes a {tag: field text} dict, returns the matching book information.
def ParseJsonCNMARC(dictjson):
    """Extract bibliographic data from one CNMARC record.

    *dictjson* maps 3-character field tags to field text in which the
    subfield delimiters were already removed, so subfield codes are found by
    searching for the bare letter.  That heuristic is inherited: a code
    letter or a ``9`` occurring in the data itself will cut a value short.

    Returns a dict.  Scalar entries are None when the field or subfield is
    absent; ``tags``, ``year_sub``, ``area_sub`` and ``yopic_sub`` exist only
    when field 606 is present, and ``n_series_title`` only when field 225 is.
    The input dict is never modified.
    """
    get = dictjson.get
    f010 = get('010')
    f100 = get('100')
    f200 = get('200')
    f210 = get('210')
    f215 = get('215')
    f330 = get('330')
    f606 = get('606')
    f701 = get('701')

    BookInfo = {}

    # The misspelt key is part of the existing output format.
    BookInfo['rganization_name'] = _subfield(get('801'), 'b', ('c',))
    BookInfo['country_code'] = _subfield(get('102'), 'a', ('b',))
    BookInfo['book_responsible'] = _subfield(f701, '4')

    # primary_responsible: text after 'f', up to a 'g' that follows it.
    primary = None
    if f200 is not None and 'f' in f200:
        start = f200.index('f') + 1
        end = f200.find('g')
        if end < start:  # no 'g', or a 'g' located before the 'f'
            end = len(f200)
        primary = f200[start:end]
    BookInfo['primary_responsible'] = primary

    # other_responsible: text after the last 'g' when it follows the last 'f'.
    other = None
    if f200 is not None and 'f' in f200 and 'g' in f200:
        g_position = f200.rindex('g')
        if g_position > f200.rindex('f'):
            other = f200[g_position + 1:]
    BookInfo['other_responsible'] = other

    BookInfo['clc_sort_num'] = _subfield(get('690'), 'a', ('v5',))
    BookInfo['publisher_name'] = _subfield(f210, 'c', ('d',))
    BookInfo['publish_place'] = _subfield(f210, 'a', ('c',))
    BookInfo['publisher_date'] = _subfield(f210, 'd')
    BookInfo['binding'] = _subfield(f010, 'b', ('d',))

    # length_style: the two characters after the last 'b' of field 200.
    length_style = None
    if f200 is not None and 'b' in f200:
        start = f200.rindex('b') + 1
        length_style = f200[start:start + 2]
    BookInfo['length_style'] = length_style

    BookInfo['price'] = _subfield(f010, 'd')

    # summary: text after 'a', or the whole field when there is no 'a'.
    summary = _subfield(f330, 'a')
    BookInfo['summary'] = f330 if summary is None else summary

    BookInfo['isbn'] = _subfield(f010, 'a', ('b', 'd'))
    BookInfo['size'] = _subfield(f215, 'd', ('cm',))

    # pagesize: leading/trailing 'c' and 'm' are stripped first so the unit
    # of a trailing "cm" is not mistaken for a 'c' marker.
    pagesize = None
    if f215 is not None and 'a' in f215:
        pagesize = _subfield(f215.strip('cm'), 'a', ('c', 'd', 'e'))
    BookInfo['pagesize'] = pagesize

    # print_date: the eight characters that follow 'a' in field 100.
    print_date = None
    if f100 is not None and 'a' in f100:
        start = f100.index('a') + 1
        print_date = f100[start:start + 8]
    BookInfo['print_date'] = print_date

    # title: text between 'a' and 'b' (or the end), cut at the first '9'.
    title = _subfield(f200, 'a', ('b',))
    if title is not None:
        title = title.partition('9')[0]
    BookInfo['title'] = title

    # authors: names from fields 701 and 702.
    authors = []
    for tag in ('701', '702'):
        name = _subfield(get(tag), 'a', ('9',))
        if name is not None:
            authors.append(name)
    # NOTE(review): primary_responsible (possibly None) is appended only when
    # an author was already found; confirm this was not meant as a fallback
    # for the empty case.
    if authors:
        authors.append(primary)
    BookInfo['authors'] = authors

    # main_heading: first subject heading of field 606.
    heading = _subfield(f606, 'a', ('x', 'y', 'j', 'z'))
    if heading is not None:
        heading = heading.partition('y')[0]
    BookInfo['main_heading'] = heading

    if f606 is not None:
        # Drop the first two characters, then split on every subfield code.
        BookInfo['tags'] = f606[2:].translate(_TAG_SEPARATORS).split('#')[1:]
        BookInfo['year_sub'] = _subfield(f606, 'z')
        BookInfo['area_sub'] = _subfield(f606, 'y', ('z',))
        # The misspelt key is part of the existing output format.
        BookInfo['yopic_sub'] = _subfield(f606, 'x', ('y',))

    if '225' in dictjson:
        BookInfo['n_series_title'] = _subfield(get('225'), 'a', ('i', 'f'))

    # Naive UTC timestamp, obtained without the deprecated utcnow().
    BookInfo['updatetime'] = datetime.datetime.now(
        datetime.timezone.utc).replace(tzinfo=None)
    BookInfo['createtime'] = BookInfo['updatetime']
    BookInfo['updateuserid'] = 'admin'
    BookInfo['createuserid'] = 'admin'
    BookInfo['version'] = float(round(time.time() * 1000))
    BookInfo['__v'] = 0
    return BookInfo