python3解析cnmarc格式
cnmarc是中国图书馆的机读格式,这里是超简单讲解(https://blog.csdn.net/chaishen10000/article/details/79245055)
这里是用 ASP 解析的版本,不过有个小问题:没有判断中文的标点符号(https://blog.csdn.net/dyllove98/article/details/8717830)
下面的代码封装成了 Django 视图,不过逻辑一看就懂(解析前需要先把 ISO 文件改名为 txt)
#main.py
#!/usr/bin/env python
# coding=utf-8
#
# Author: archer
# File: main.py
# Desc: Process all json data files, extract the BookInfo
# Produced By BR
from django.http import HttpResponse
import json
from app.c.util import ParseJsonCNMARC
import glob
import pymysql
import re
def removeBom(file):
    """Strip a leading UTF-8 byte-order mark from ``file``, rewriting it in place.

    The file is only rewritten when its first three bytes are exactly the
    UTF-8 BOM; otherwise it is left untouched.

    Args:
        file: Path of the file to check.
    """
    bom = b'\xef\xbb\xbf'
    # Finish reading and close the handle before reopening the same path for
    # writing, so no read handle is leaked or left open during the rewrite.
    with open(file, 'rb') as source:
        if source.read(len(bom)) != bom:
            return
        body = source.read()
    with open(file, 'wb') as target:
        target.write(body)
# Full-width CJK punctuation: comma, enumeration comma, full stop, question
# mark, exclamation mark, parentheses, colon and semicolon. Written as escapes
# so they cannot be confused with (or normalised into) their ASCII look-alikes.
_WIDE_PUNCTUATION = frozenset(
    "\uff0c\u3001\u3002\uff1f\uff01\uff08\uff09\uff1a\uff1b"
)


def is_chinese(uchar):
    """Report whether ``uchar`` is a Han character or full-width punctuation.

    GetCharLength uses this to decide whether a character counts as two
    units or one, so ASCII punctuation must NOT match here.

    Args:
        uchar: A single-character string.

    Returns:
        1 if the character is one of the listed full-width punctuation marks
        or lies in the range U+4E00..U+9FA5, otherwise 0.
    """
    if uchar in _WIDE_PUNCTUATION:
        return 1
    if u'\u4e00' <= uchar <= u'\u9fa5':
        return 1
    return 0
# print(is_chinese(","))
# exit()
def GetCharLength(MarcText, _Len):
    """Translate a width-based offset into a character index of ``MarcText``.

    Characters are consumed from the start of the text; each one that
    is_chinese() flags counts as 2 units, every other character as 1. The
    walk stops once ``_Len`` units have been counted or the last character
    is reached (the final character itself is never consumed).

    Args:
        MarcText: The text to walk.
        _Len: Number of width units to consume.

    Returns:
        The number of characters consumed, at most ``len(MarcText) - 1``.
    """
    units = 0
    position = 0
    limit = len(MarcText) - 1
    while units < _Len and position < limit:
        units += 2 if is_chinese(MarcText[position]) else 1
        position += 1
    return position
def toJson(dd):
    """Split one MARC record line into a ``{field tag: field text}`` dict.

    Layout read from the record string:
      * characters 12-16 hold the start offset of the data area;
      * the directory starts at offset 24 and ends one character before the
        data area; each 12-character entry is a 3-char tag, a 4-digit field
        length and a 5-digit field start position.

    Length and position are converted to character indexes with
    GetCharLength() before slicing. In each field value the sequence
    ``"\\x1f" + "f"`` becomes ``"b"``, remaining ``\\x1f``, ``\\x1e`` and tab
    characters are removed, and the result is stripped.

    When a tag occurs more than once, the last occurrence wins.

    Args:
        dd: One record as a string.

    Returns:
        dict mapping each 3-character tag to its cleaned field text.
    """
    data_start = int(str(dd[12:17]).lstrip("0"))
    entry_total = int((data_start - 24 - 1) / 12)
    directory = dd[24:data_start - 1]
    payload = dd[data_start:]

    # Applied in this exact order: the "\x1ff" rewrite must run before the
    # bare "\x1f" removal.
    cleanups = (("\x1ff", 'b'), ("\x1f", ""), ("\x1e", ""), ("\t", ""))

    record = {}
    for entry_index in range(entry_total):
        entry = directory[entry_index * 12:entry_index * 12 + 12]
        tag = entry[0:3]
        field_len = int(entry[3:7])
        field_pos = int(entry[7:12])

        # Map the directory offsets onto character indexes of the payload.
        first = GetCharLength(payload, field_pos)
        last = GetCharLength(payload, field_pos + field_len)

        text = payload[first:last]
        for old, new in cleanups:
            text = text.replace(old, new)
        record[tag] = text.strip()
    return record
def resultData():
    """Return a fresh book-row template with every column set to its default.

    All columns default to the empty string except ``price`` (0.0) and
    ``entryTime`` ("2004-05-24"). A new dict is created on every call.
    """
    columns = (
        "bookNumber",
        "title",
        "authors",
        "index",
        "isbn",
        "publisher",
        "publishPlace",
        "publishTime",
        "price",
        "YSBMY",
        "entryTime",
        "context",
        "pages",
        "format",
        "classifyNumber",
        "summary",
        "origin",
    )
    template = dict.fromkeys(columns, "")
    template["price"] = 0.0
    template["entryTime"] = "2004-05-24"
    return template
def connectMysql():
    """Open a connection to the local MySQL ``test`` database.

    Returns:
        A ``(connection, cursor)`` tuple; the cursor belongs to the returned
        connection.
    """
    settings = dict(
        host="127.0.0.1",
        port=3306,
        user="root",
        passwd="",
        db="test",
        charset="utf8",
    )
    connection = pymysql.connect(**settings)
    return connection, connection.cursor()
def insertIntoMysql(data):
    """Bulk-insert book rows into the ``books`` table.

    Args:
        data: Iterable of rows. A row is either a sequence of 17 values in
            column order, or a dict keyed by column name (the shape built by
            formData), which is converted to a tuple in column order.

    Raises:
        KeyError: If a dict row lacks one of the columns.
        Exception: Any error raised by the database driver is re-raised after
            the transaction has been rolled back.
    """
    columns = (
        "bookNumber", "title", "authors", "index", "isbn", "publisher",
        "publishPlace", "publishTime", "price", "YSBMY", "entryTime",
        "context", "pages", "format", "classifyNumber", "summary", "origin",
    )
    # ``index`` is a reserved word in MySQL and has to be back-quoted.
    column_sql = ",".join("`index`" if name == "index" else name for name in columns)
    placeholders = ",".join(["%s"] * len(columns))
    sql = "insert into {}({}) values({})".format("books", column_sql, placeholders)

    # Positional %s placeholders cannot be filled from a dict, so dict rows
    # are flattened into tuples first.
    rows = [
        tuple(row[name] for name in columns) if isinstance(row, dict) else row
        for row in data
    ]

    conn, cur = connectMysql()
    try:
        cur.executemany(sql, rows)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        # Always release the cursor and connection, even on failure.
        cur.close()
        conn.close()
# A decimal number such as "25", "25.00", "25." or ".5".
_PRICE_NUMBER = re.compile(r"\d+(?:\.\d*)?|\.\d+")


def getPrice(price):
    """Extract the numeric amount from a price string such as ``"CNY25.00"``.

    The last number found in the string is used, so both a leading currency
    code and a trailing unit or note are tolerated.

    Args:
        price: Price text.

    Returns:
        The amount as a float.

    Raises:
        ValueError: If the string contains no number.
        TypeError: If ``price`` is not a string.
    """
    numbers = _PRICE_NUMBER.findall(price)
    if not numbers:
        raise ValueError("no numeric amount in price: {!r}".format(price))
    return float(numbers[-1])
def formData(BookInfo):
    """Map a parsed book record onto the flat row layout from resultData().

    Values that are missing or ``None`` in ``BookInfo`` leave the template
    default in place, so every text column of the result is a string and the
    ``context`` column can always be assembled.

    Args:
        BookInfo: dict of parsed book fields (the keys read here are
            ``title``, ``authors``, ``clc_sort_num``, ``isbn``,
            ``publisher_name``, ``publish_place``, ``print_date``, ``price``,
            ``pagesize``, ``size`` and ``summary``).

    Returns:
        dict with the columns defined by resultData().
    """
    data = resultData()

    # (row column, BookInfo key) pairs that are copied over unchanged.
    direct_fields = (
        ("title", "title"),
        ("index", "clc_sort_num"),
        ("isbn", "isbn"),
        ("publisher", "publisher_name"),
        ("publishPlace", "publish_place"),
        ("publishTime", "print_date"),
        ("pages", "pagesize"),
        ("summary", "summary"),
    )
    for column, source_key in direct_fields:
        value = BookInfo.get(source_key)
        if value is not None:
            data[column] = value

    # Only the first listed author is stored.
    authors = BookInfo.get("authors")
    if authors and authors[0] is not None:
        data["authors"] = authors[0]

    raw_price = BookInfo.get("price")
    if raw_price is not None:
        try:
            data["price"] = getPrice(raw_price)
        except (IndexError, TypeError, ValueError):
            # Unparseable price text: keep the 0.0 default.
            pass

    size = BookInfo.get("size")
    if size is not None:
        data["format"] = "{}cm".format(size)

    # Concatenation of the main columns; only the first four characters of
    # the publish time are included.
    data["context"] = "".join((
        str(data["title"]),
        str(data["authors"]),
        str(data["isbn"]),
        str(data["index"]),
        str(data["publisher"]),
        str(data["publishPlace"]),
        str(data["publishTime"])[0:4],
        "CNY{}".format(data["price"]),
        str(data["summary"]),
    ))
    return data
def getJsonData(path):
    """Read a GBK-encoded text file of MARC records, one record per line.

    A UTF-8 BOM at the start of the file is removed first (the file is
    rewritten in place by removeBom). Lines that are empty or whitespace-only
    are skipped, because toJson cannot parse them.

    Args:
        path: Path of the text file.

    Returns:
        list of dicts, one per record line, as produced by toJson().
    """
    removeBom(path)
    with open(path, 'r', encoding="gbk") as marc_file:
        return [toJson(line) for line in marc_file if line.strip()]
def index(request):
    """Parse the MARC text file named in the request and return it as JSON.

    The file path is taken from the ``path`` POST parameter and defaults to
    ``testt.txt``. Every record is parsed with ParseJsonCNMARC and flattened
    with formData; the resulting list is serialised as the response body.

    Args:
        request: The incoming request; only ``request.POST`` is read.

    Returns:
        HttpResponse with content type ``application/json``.
    """
    # NOTE(review): the path comes straight from the request and is passed to
    # getJsonData, which reads the file and may rewrite it via removeBom —
    # confirm that only trusted clients can reach this view.
    path = request.POST.get('path', 'testt.txt')
    books = [formData(ParseJsonCNMARC(record)) for record in getJsonData(path)]
    return HttpResponse(json.dumps(books), content_type="application/json")
# print(allData)
# insertIntoMysql(allData)
# exit()
# print(collection.insert_one(BookInfo).inserted_id, BookInfo['main_heading'])
# index()

# util.py
#!/usr/bin/env python
# coding=utf-8
import time
import datetime
def ReadFile(filename, encoding=None):
    """Read a whole text file into a string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform-dependent default; pass e.g. ``"utf-8"`` or
            ``"gbk"`` for files whose encoding is known.

    Returns:
        The file content as a string.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.read()
# Subfield code letters of field 606 that separate individual subject tags.
_TAG_SEPARATORS = str.maketrans('axyzj', '#####')


def _subfield(field, start_code, end_codes=(), fallback_end=None):
    """Return the text between ``start_code`` and the first end code present.

    Codes are located as plain substrings of the field text, so a code that
    also occurs inside the data will be mistaken for a delimiter.

    Args:
        field: Field text, or None when the record has no such field.
        start_code: Code whose first occurrence precedes the wanted text.
        end_codes: Candidate terminating codes, tried in order; the first one
            that occurs anywhere in the field ends the text.
        fallback_end: Slice end used when no end code occurs: ``None`` keeps
            everything up to the end of the field, ``-1`` drops the field's
            final character.

    Returns:
        The extracted text, or None when the field or start code is missing.
    """
    if field is None or start_code not in field:
        return None
    start = field.index(start_code) + 1
    end = fallback_end
    for code in end_codes:
        if code in field:
            end = field.index(code)
            break
    return field[start:end]


def ParseJsonCNMARC(dictjson):
    """Extract book information from one CNMARC record.

    The input dict is never modified. Every key listed below is always
    present in the result; text values are None when the field or its
    subfield code is missing (``authors`` and ``tags`` are empty lists).

    NOTE(review): several extractions end at slice index -1 when no
    terminating code is found, which drops the field's last character. That
    behaviour is kept as-is; confirm whether it is intentional.

    Args:
        dictjson: dict mapping 3-character field tags ('010', '200', ...) to
            the field text, with subfield codes embedded as plain characters.

    Returns:
        dict with the keys ``rganization_name``, ``country_code``,
        ``book_responsible``, ``primary_responsible``, ``other_responsible``,
        ``clc_sort_num``, ``publisher_name``, ``publish_place``,
        ``publisher_date``, ``binding``, ``length_style``, ``price``,
        ``summary``, ``isbn``, ``size``, ``pagesize``, ``print_date``,
        ``title``, ``authors``, ``main_heading``, ``tags``, ``year_sub``,
        ``area_sub``, ``yopic_sub``, ``n_series_title``, ``updatetime``,
        ``createtime``, ``updateuserid``, ``createuserid``, ``version`` and
        ``__v``. The misspelled keys ``rganization_name`` and ``yopic_sub``
        are kept unchanged for existing consumers.
    """
    f010 = dictjson.get('010')
    f100 = dictjson.get('100')
    f200 = dictjson.get('200')
    f210 = dictjson.get('210')
    f215 = dictjson.get('215')
    f330 = dictjson.get('330')
    f606 = dictjson.get('606')

    BookInfo = {}

    BookInfo['rganization_name'] = _subfield(dictjson.get('801'), 'b', ('c',), -1)
    BookInfo['country_code'] = _subfield(dictjson.get('102'), 'a', ('b',))
    BookInfo['book_responsible'] = _subfield(dictjson.get('701'), '4', (), -1)

    # primary_responsible: text after the first 'f', up to the first 'g'
    # unless that 'g' precedes the text, in which case the rest of the field.
    primary_responsible = None
    if f200 is not None and 'f' in f200:
        start = f200.index('f') + 1
        end = f200.index('g') if 'g' in f200 else len(f200)
        if end < start:
            end = len(f200)
        primary_responsible = f200[start:end]
    BookInfo['primary_responsible'] = primary_responsible

    # other_responsible: text after the last 'g', provided it follows the
    # last 'f'.
    other_responsible = None
    if f200 is not None and 'f' in f200 and 'g' in f200:
        last_g = f200.rindex('g')
        if last_g > f200.rindex('f'):
            other_responsible = f200[last_g + 1:]
    BookInfo['other_responsible'] = other_responsible

    BookInfo['clc_sort_num'] = _subfield(dictjson.get('690'), 'a', ('v5',))
    BookInfo['publisher_name'] = _subfield(f210, 'c', ('d',), -1)
    BookInfo['publish_place'] = _subfield(f210, 'a', ('c',), -1)
    BookInfo['publisher_date'] = _subfield(f210, 'd')
    BookInfo['binding'] = _subfield(f010, 'b', ('d',))

    # length_style: the two characters following the LAST 'b' of field 200.
    length_style = None
    if f200 is not None and 'b' in f200:
        start = f200.rindex('b') + 1
        length_style = f200[start:start + 2]
    BookInfo['length_style'] = length_style

    BookInfo['price'] = _subfield(f010, 'd')

    # summary: the whole field when it carries no 'a' code.
    if f330 is None:
        BookInfo['summary'] = None
    else:
        start = f330.index('a') + 1 if 'a' in f330 else 0
        BookInfo['summary'] = f330[start:]

    BookInfo['isbn'] = _subfield(f010, 'a', ('b', 'd'), -1)
    BookInfo['size'] = _subfield(f215, 'd', ('cm',), -1)

    # pagesize is read from a local copy with leading/trailing 'c'/'m'
    # characters stripped; the caller's dict is left alone.
    pages_field = None if f215 is None else f215.strip('cm')
    BookInfo['pagesize'] = _subfield(pages_field, 'a', ('c', 'd', 'e'), -1)

    # print_date: the eight characters following the first 'a' of field 100.
    print_date = _subfield(f100, 'a')
    BookInfo['print_date'] = None if print_date is None else print_date[:8]

    # title: text after 'a' up to 'b' (or the end of the field when there is
    # no 'b'), then cut at the first '9' if one occurs.
    title = _subfield(f200, 'a', ('b',))
    if title is not None and '9' in title:
        title = title[:title.index('9')]
    BookInfo['title'] = title

    # authors: names from fields 701 and 702; when at least one was found,
    # primary_responsible is appended as well.
    authors = []
    for tag in ('701', '702'):
        name = _subfield(dictjson.get(tag), 'a', ('9',), -1)
        if name is not None:
            authors.append(name)
    if authors:
        authors.append(primary_responsible)
    BookInfo['authors'] = authors

    main_heading = _subfield(f606, 'a', ('x', 'y', 'j', 'z'))
    if main_heading is not None and 'y' in main_heading:
        main_heading = main_heading[:main_heading.index('y')]
    BookInfo['main_heading'] = main_heading

    if f606 is None:
        BookInfo['tags'] = []
        BookInfo['year_sub'] = None
        BookInfo['area_sub'] = None
        BookInfo['yopic_sub'] = None
    else:
        # Drop the two leading characters, then work on local copies so the
        # sub-heading codes are still present after the tag split.
        subject = f606[2:]
        BookInfo['tags'] = subject.translate(_TAG_SEPARATORS).split('#')[1:]
        BookInfo['year_sub'] = _subfield(subject, 'z')
        BookInfo['area_sub'] = _subfield(subject, 'y', ('z',), -1)
        BookInfo['yopic_sub'] = _subfield(subject, 'x', ('y',), -1)

    BookInfo['n_series_title'] = _subfield(dictjson.get('225'), 'a', ('i', 'f'), -1)

    # Naive UTC timestamp, shared by updatetime and createtime.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    BookInfo['updatetime'] = now
    BookInfo['createtime'] = now
    BookInfo['updateuserid'] = 'admin'
    BookInfo['createuserid'] = 'admin'
    # Current time in milliseconds, stored as a float.
    BookInfo['version'] = float(round(time.time() * 1000))
    BookInfo['__v'] = 0
    return BookInfo