php是最好的语言

python3解析cnmarc格式

cnmarc是中国图书馆的机读格式,这里是超简单讲解(https://blog.csdn.net/chaishen10000/article/details/79245055)

这里是一个用asp实现的解析示例,不过它有个小问题:计算长度时没有把中文标点符号按双字节处理(https://blog.csdn.net/dyllove98/article/details/8717830)

下面的代码封装成了django视图,不过逻辑很简单,一看就懂(解析前需要先把ISO文件的扩展名改为txt)


#main.py
#!/usr/bin/env python
# coding=utf-8
#
# Author: archer
# File: main.py
# Desc: Process all json data files, extract the BookInfo
# Produced By BR
from django.http import HttpResponse
import json
from app.c.util import ParseJsonCNMARC
import glob
import pymysql
import re

def removeBom(file):
    """Strip a leading UTF-8 byte-order mark from ``file`` in place.

    The file is rewritten only when its first three bytes are the UTF-8 BOM;
    otherwise it is left untouched.

    Args:
        file: Path of the file to inspect and, if needed, rewrite.
    """
    BOM = b'\xef\xbb\xbf'

    # Read everything and close the handle before reopening for writing, so
    # no descriptor is leaked and the file is never open twice at once.
    with open(file, 'rb') as source:
        if source.read(len(BOM)) != BOM:
            return
        body = source.read()

    with open(file, 'wb') as target:
        target.write(body)

def is_chinese(uchar):
    """Return 1 if ``uchar`` is a CJK ideograph or full-width punctuation, else 0.

    The result is used by the length calculation in this file to count a
    character as two units instead of one, so only characters that are
    double-width belong here. ASCII punctuation such as "," or "(" is
    single-width and must return 0.

    Args:
        uchar: A single-character string.

    Returns:
        1 for U+4E00..U+9FA5 and the listed full-width punctuation marks,
        0 for everything else.
    """
    # Full-width comma, enumeration comma, full stop, question mark,
    # exclamation mark, parentheses, colon and semicolon. Written as escapes
    # so editors or scrapers cannot fold them into their ASCII look-alikes.
    wide_punctuation = frozenset((
        "\uff0c", "\u3001", "\u3002", "\uff1f", "\uff01",
        "\uff08", "\uff09", "\uff1a", "\uff1b",
    ))
    if uchar in wide_punctuation:
        return 1

    if u'\u4e00' <= uchar <= u'\u9fa5':
        return 1
    return 0


# print(is_chinese(","))
# exit()
def GetCharLength(MarcText,_Len):
    """Translate a width-based offset into a character index of ``MarcText``.

    Walks the text from the start, counting 2 for every character that
    ``is_chinese`` accepts and 1 for every other character, and stops as soon
    as the running width reaches ``_Len`` or the scan reaches the last
    character of the text (the final character is never consumed).

    Args:
        MarcText: Text to scan.
        _Len: Target width to reach.

    Returns:
        Number of characters consumed from the start of ``MarcText``.
    """
    width = 0
    position = 0
    last_index = len(MarcText) - 1
    while width < _Len and position < last_index:
        width += 2 if is_chinese(MarcText[position]) else 1
        position += 1
    return position


def toJson(dd):
    """Split one raw MARC record line into a ``{field tag: field text}`` dict.

    Layout read from the record, as used by this code:
      * characters 12..16 hold the start address of the data area;
      * the directory starts at character 24 and consists of 12-character
        entries: 3 for the tag, 4 for the field length, 5 for the field start;
      * lengths and starts are width-based, so they are converted to character
        indexes with ``GetCharLength`` before slicing.

    Args:
        dd: One record as a string.

    Returns:
        Dict mapping each field tag to its cleaned text. When a tag occurs
        more than once, the last occurrence wins.
    """
    # Start address of the data area.
    data_start = int(str(dd[12:17]).lstrip("0"))

    # Number of directory entries.
    entry_total = int((data_start - 24 - 1) / 12)

    # Directory text and data-area text.
    directory = dd[24:data_start - 1]
    payload = dd[data_start:]

    tags = []
    texts = []

    for entry_index in range(entry_total):
        base = entry_index * 12
        tags.append(directory[base:base + 3])

        # Declared length and start of this field inside the data area.
        declared_len = int(directory[base + 3:base + 7])
        declared_pos = int(directory[base + 7:base + 12])

        # Convert both boundaries to character indexes of the mixed text.
        char_from = GetCharLength(payload, declared_pos)
        char_to = GetCharLength(payload, declared_pos + declared_len)

        text = payload[char_from:char_to]
        # A delimiter followed by "f" becomes a plain "b"; every other
        # delimiter, field terminator and tab is dropped.
        text = text.replace("\x1ff", 'b')
        text = text.replace("\x1f", "")
        text = text.replace("\x1e", "")
        text = text.replace("\t", "")
        texts.append(text.strip())

    return dict(zip(tags, texts))

def resultData():
    """Return a fresh book record pre-filled with default values.

    Every text column defaults to an empty string, ``price`` to ``0.0`` and
    ``entryTime`` to the fixed date ``"2004-05-24"``. A new dict is built on
    each call.
    """
    column_names = (
        "bookNumber",
        "title",
        "authors",
        "index",
        "isbn",
        "publisher",
        "publishPlace",
        "publishTime",
        "price",
        "YSBMY",
        "entryTime",
        "context",
        "pages",
        "format",
        "classifyNumber",
        "summary",
        "origin",
    )
    record = dict.fromkeys(column_names, "")
    # The only two columns whose default is not the empty string.
    record["price"] = 0.0
    record["entryTime"] = "2004-05-24"
    return record

def connectMysql():
    """Open a connection to the local MySQL ``test`` database.

    Returns:
        A ``(connection, cursor)`` tuple; the cursor belongs to the returned
        connection.
    """
    connection = pymysql.connect(
        host="127.0.0.1",
        port=3306,
        user="root",
        passwd="",
        db="test",
        charset="utf8",
    )
    return connection, connection.cursor()

def insertIntoMysql(data):
    """Bulk-insert book rows into the ``books`` table.

    Args:
        data: Iterable of rows. Each row is either a sequence of the 17 column
            values in statement order, or a dict keyed by column name (the
            shape produced by ``formData``), which is converted to a tuple in
            column order so it can be bound to the positional placeholders.

    Raises:
        Any exception raised while executing or committing is re-raised after
        the transaction has been rolled back.
    """
    columns = (
        "bookNumber", "title", "authors", "index", "isbn", "publisher",
        "publishPlace", "publishTime", "price", "YSBMY", "entryTime",
        "context", "pages", "format", "classifyNumber", "summary", "origin",
    )
    sql2 = "insert into {}" \
           "(bookNumber,title,authors,`index`,isbn,publisher,publishPlace,publishTime,price,YSBMY,entryTime,context,pages,format,classifyNumber,summary,origin) " \
           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)".format("books")

    rows = [
        tuple(row[name] for name in columns) if isinstance(row, dict) else row
        for row in data
    ]

    conn, cur = connectMysql()
    try:
        cur.executemany(sql2, rows)  # executemany: one statement, many rows
        conn.commit()
    except Exception:
        # Do not leave a half-applied batch behind; the caller still sees
        # the original error.
        conn.rollback()
        raise
    finally:
        # Release the cursor and the connection whether or not the insert
        # succeeded.
        cur.close()
        conn.close()

def getPrice(price):
    """Extract the numeric amount from a price string such as ``"CNY25.00"``.

    The last number found in the string is used, so a currency prefix or a
    trailing unit does not matter.

    Args:
        price: Price text.

    Returns:
        The amount as a float.

    Raises:
        ValueError: If ``price`` contains no number.
        TypeError: If ``price`` is not a string.
    """
    # Raw string, so "\." is a regex escape and not an invalid string escape.
    # Accepts "25", "25.", "25.00" and ".5".
    numbers = re.findall(r"[0-9]+(?:\.[0-9]*)?|\.[0-9]+", price)
    if not numbers:
        raise ValueError("no numeric amount in price: {!r}".format(price))
    return float(numbers[-1])

def formData(BookInfo):
    """Map a parsed book-info dict onto the flat record used for storage.

    Starts from the defaults of ``resultData()`` and overrides a column only
    when the corresponding ``BookInfo`` value is present and not ``None``.
    Missing or ``None`` values therefore keep their default instead of
    overwriting it, which also lets ``context`` always be assembled from
    whatever fields are available.

    Args:
        BookInfo: Dict as produced by ``ParseJsonCNMARC``.

    Returns:
        A dict with the same keys as ``resultData()``.
    """
    data = resultData()

    # (target column, BookInfo key) pairs that are copied verbatim.
    direct_fields = (
        ("title", "title"),
        ("index", "clc_sort_num"),
        ("isbn", "isbn"),
        ("publisher", "publisher_name"),
        ("publishPlace", "publish_place"),
        ("publishTime", "print_date"),
        ("pages", "pagesize"),
        ("summary", "summary"),
    )
    for target, source in direct_fields:
        value = BookInfo.get(source)
        if value is not None:
            data[target] = value

    # Only the first author is stored.
    authors = BookInfo.get("authors")
    if authors and authors[0] is not None:
        data["authors"] = authors[0]

    raw_price = BookInfo.get("price")
    if raw_price is not None:
        try:
            data["price"] = getPrice(raw_price)
        except (TypeError, ValueError, IndexError):
            # Unparseable price text: keep the 0.0 default.
            pass

    size = BookInfo.get("size")
    if size is not None:
        data["format"] = "{}cm".format(size)

    # Concatenation of the main columns into a single "context" text column.
    try:
        data["context"] = "".join((
            data["title"],
            data["authors"],
            data["isbn"],
            data["index"],
            data["publisher"],
            data["publishPlace"],
            data["publishTime"][0:4],
            "CNY{}".format(data["price"]),
            data["summary"],
        ))
    except TypeError:
        # A non-string value slipped in: leave context at its default
        # rather than failing the whole record.
        pass

    return data

def getJsonData(path):
    """Read a MARC text file and parse each non-blank line into a field dict.

    A leading UTF-8 BOM is first stripped from the file in place, then the
    file is read as GBK text with one record per line.

    Args:
        path: Path of the text file to parse.

    Returns:
        List with one ``toJson`` result per non-blank line.
    """
    # Strip the BOM bytes at the start of the txt file, if any.
    removeBom(path)

    # The context manager closes the file even if a record fails to parse.
    # Blank lines (for example a trailing newline) carry no record and would
    # make the header parsing fail, so they are skipped.
    with open(path, 'r', encoding="gbk") as marc_file:
        return [toJson(line) for line in marc_file if line.strip()]

def index(request):
    """Django view: parse the MARC text file named by POST ``path`` to JSON.

    Reads the ``path`` POST parameter (default ``'testt.txt'``), parses every
    record in that file and responds with a JSON array of flat book records.
    """
    # NOTE(review): ``path`` comes straight from the request and the file is
    # both read and rewritten in place (BOM stripping) downstream; confirm
    # this endpoint is only reachable by trusted callers or restrict the path
    # to a known directory.
    path = request.POST.get('path', 'testt.txt')
    books = [formData(ParseJsonCNMARC(record)) for record in getJsonData(path)]
    return HttpResponse(json.dumps(books), content_type="application/json")
    # print(allData)
        # insertIntoMysql(allData)
            # exit()
        # print(collection.insert_one(BookInfo).inserted_id, BookInfo['main_heading'])
# index()

#util.py

#!/usr/bin/env python
# coding=utf-8
import time
import datetime

def ReadFile(filename):
    """Read a whole text file and return its content as one string.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding).
    """
    with open(filename, 'r') as source:
        content = source.read()
    return content

def _extract(field, marker, end_markers=()):
    """Return the text of ``field`` that follows the first ``marker``.

    The text ends at the first entry of ``end_markers`` (checked in the given
    priority order, not by position) that occurs in ``field``; if that
    occurrence lies before the start, or no end marker occurs at all, the
    text runs to the end of the field. Returns ``None`` when ``field`` is
    ``None`` or does not contain ``marker``.
    """
    if field is None or marker not in field:
        return None
    start = field.index(marker) + 1
    end = len(field)
    for candidate in end_markers:
        if candidate in field:
            position = field.index(candidate)
            if position >= start:
                end = position
            break
    return field[start:end]


def ParseJsonCNMARC(dictjson):
    """Extract book information from one CNMARC record given as a dict.

    The record maps field tags (``'010'``, ``'200'``, ...) to field text in
    which subfield codes appear as plain letters. Each value is located by
    searching for those letters, so results are heuristic when the text
    itself contains the same letters.

    Every key listed below is always present in the result; a value that
    cannot be found is ``None`` (``authors`` and ``tags`` are empty lists).
    The input dict is not modified.

    Args:
        dictjson: Dict of field tag -> field text for one record.

    Returns:
        Dict with the keys rganization_name, country_code, book_responsible,
        primary_responsible, other_responsible, clc_sort_num, publisher_name,
        publish_place, publisher_date, binding, length_style, price, summary,
        isbn, size, pagesize, print_date, title, authors, main_heading, tags,
        year_sub, area_sub, yopic_sub, n_series_title, updatetime, createtime,
        updateuserid, createuserid, version and __v.
    """
    BookInfo = {}

    field_010 = dictjson.get('010')
    field_100 = dictjson.get('100')
    field_200 = dictjson.get('200')
    field_210 = dictjson.get('210')
    field_215 = dictjson.get('215')
    field_606 = dictjson.get('606')

    # rganization_name: field 801, after 'b' up to 'c'
    BookInfo['rganization_name'] = _extract(dictjson.get('801'), 'b', ('c',))

    # country_code: field 102, after 'a' up to 'b'
    BookInfo['country_code'] = _extract(dictjson.get('102'), 'a', ('b',))

    # book_responsible: field 701, everything after '4'
    BookInfo['book_responsible'] = _extract(dictjson.get('701'), '4')

    # primary_responsible: field 200, after 'f' up to 'g'
    BookInfo['primary_responsible'] = _extract(field_200, 'f', ('g',))

    # other_responsible: field 200, text after the last 'g', provided that
    # 'g' comes after the last 'f'
    other_responsible = None
    if field_200 is not None and 'f' in field_200 and 'g' in field_200:
        last_g = field_200.rindex('g')
        if last_g > field_200.rindex('f'):
            other_responsible = field_200[last_g + 1:]
    BookInfo['other_responsible'] = other_responsible

    # clc_sort_num: field 690, after 'a' up to 'v5'
    BookInfo['clc_sort_num'] = _extract(dictjson.get('690'), 'a', ('v5',))

    # publisher_name: field 210, after 'c' up to 'd'
    BookInfo['publisher_name'] = _extract(field_210, 'c', ('d',))

    # publish_place: field 210, after 'a' up to 'c'
    BookInfo['publish_place'] = _extract(field_210, 'a', ('c',))

    # publisher_date: field 210, everything after 'd'
    BookInfo['publisher_date'] = _extract(field_210, 'd')

    # binding: field 010, after 'b' up to 'd'
    BookInfo['binding'] = _extract(field_010, 'b', ('d',))

    # length_style: field 200, the two characters after the last 'b'
    length_style = None
    if field_200 is not None and 'b' in field_200:
        start = field_200.rindex('b') + 1
        length_style = field_200[start:start + 2]
    BookInfo['length_style'] = length_style

    # price: field 010, everything after 'd'
    BookInfo['price'] = _extract(field_010, 'd')

    # summary: field 330, everything after 'a', or the whole field when it
    # contains no 'a'
    field_330 = dictjson.get('330')
    summary = _extract(field_330, 'a')
    BookInfo['summary'] = field_330 if summary is None else summary

    # isbn: field 010, after 'a' up to 'b', else up to 'd'
    BookInfo['isbn'] = _extract(field_010, 'a', ('b', 'd'))

    # size: field 215, after 'd' up to 'cm'
    BookInfo['size'] = _extract(field_215, 'd', ('cm',))

    # pagesize: field 215, after 'a' up to 'c', else 'd', else 'e'.
    # Leading/trailing 'c' and 'm' characters are stripped from a local copy
    # first so that a trailing "cm" is not mistaken for the 'c' end marker.
    pagesize = None
    if field_215 is not None and 'a' in field_215:
        pagesize = _extract(field_215.strip('cm'), 'a', ('c', 'd', 'e'))
    BookInfo['pagesize'] = pagesize

    # print_date: field 100, the eight characters after 'a'
    print_date = None
    if field_100 is not None and 'a' in field_100:
        start = field_100.index('a') + 1
        print_date = field_100[start:start + 8]
    BookInfo['print_date'] = print_date

    # title: field 200, after 'a' up to 'b' (or the end of the field when
    # there is no 'b'), then cut at the first '9' if one is present
    title = _extract(field_200, 'a', ('b',))
    if title is not None and '9' in title:
        title = title[:title.index('9')]
    BookInfo['title'] = title

    # authors[]: field 701 then field 702, each after 'a' up to '9'
    authors = []
    for tag in ('701', '702'):
        name = _extract(dictjson.get(tag), 'a', ('9',))
        if name is not None:
            authors.append(name)
    # NOTE(review): primary_responsible (possibly None) is appended only when
    # at least one author was already found; kept as in the original, but
    # confirm whether a fallback for the empty case was intended.
    if authors:
        authors.append(BookInfo['primary_responsible'])
    BookInfo['authors'] = authors

    # main_heading: field 606, after 'a' up to 'x', else 'y', else 'j',
    # else 'z'; then cut at the first 'y' inside the result
    main_heading = _extract(field_606, 'a', ('x', 'y', 'j', 'z'))
    if main_heading is not None and 'y' in main_heading:
        main_heading = main_heading[:main_heading.index('y')]
    BookInfo['main_heading'] = main_heading

    # tags[]: field 606 without its first two characters, split on the
    # markers a, x, y, z and j; the piece before the first marker is dropped
    tags = []
    if field_606 is not None:
        flattened = field_606[2:]
        for marker in ('a', 'x', 'y', 'z', 'j'):
            flattened = flattened.replace(marker, '#')
        tags = flattened.split('#')[1:]
    BookInfo['tags'] = tags

    # The three subdivisions below are read from the unmodified field 606;
    # reading them after the marker replacement above could never find them.

    # year_sub: field 606, everything after 'z'
    BookInfo['year_sub'] = _extract(field_606, 'z')

    # area_sub: field 606, after 'y' up to 'z'
    BookInfo['area_sub'] = _extract(field_606, 'y', ('z',))

    # yopic_sub: field 606, after 'x' up to 'y'
    BookInfo['yopic_sub'] = _extract(field_606, 'x', ('y',))

    # n_series_title: field 225, after 'a' up to 'i', else 'f'
    BookInfo['n_series_title'] = _extract(dictjson.get('225'), 'a', ('i', 'f'))

    # Bookkeeping values attached to every record.
    BookInfo['updatetime'] = datetime.datetime.utcnow()
    BookInfo['createtime'] = BookInfo['updatetime']
    BookInfo['updateuserid'] = 'admin'
    BookInfo['createuserid'] = 'admin'
    # Current time in milliseconds, stored as a float.
    BookInfo['version'] = float(round(time.time() * 1000))
    BookInfo['__v'] = 0

    return BookInfo


作者:xTao 分类:LNMP 浏览:2525 评论:0