由bibtex生成引用文献字符串

2023-11-05

在 Word 文档里写引用文献时，用 Mendeley 插件生成的效果似乎一般（用法见 [1,2]），而且用来自定义格式的那个网页令人火大（可能是我的网速问题？）。

于是用 Python 写了个脚本，通过解析 BibTeX 来生成引用文献字符串，格式自定。项目页见 [4]。

Code

目前不支持中文，编码问题尚未解决。

# -*- coding: utf-8 -*-
import argparse
import chardet
import os
import platform
import re


# Clear the terminal before any output is printed. Platforms other than
# Windows and Linux are left untouched.
_clear_cmd = {"Windows": "cls", "Linux": "clear"}.get(platform.system())
if _clear_cmd is not None:
    os.system(_clear_cmd)


# Command-line interface: input BibTeX file, output file and the maximum
# number of authors to list.
parser = argparse.ArgumentParser(description='ref')
for _flag, _kind, _default, _help in (
    ('-f', str, "bib.txt", "input bibtex"),
    ('-o', str, "ref.txt", "output ref"),
    ('--n_name', int, 3,
     "#author threshold, <=0 means show all authors"),
):
    parser.add_argument(_flag, type=_kind, default=_default, help=_help)
args = parser.parse_args()

# Abbreviations of conferences and journals.
# Maps each abbreviation to the full names / alternative spellings that
# parse_booktitle() searches for in a `booktitle` or `journal` value.
JC_ABBR = {
    # conferences
    "3DV": ["International Conference on 3D Vision"],
    "AAAI": ["AAAI Conference on Artificial Intelligence"],
    "ACM MM": ["ACM International Conference on Multimedia"],
    "BMVC": ["British Machine Vision Conference"],
    "CVPR": ["Computer Vision and Pattern Recognition"],
    "ECCV": ["European Conference on Computer Vision"],
    "ICCV": ["International Conference on Computer Vision"],
    "ICIP": ["International Conference on Image Processing"],
    "ICLR": ["International Conference on Learning Representations"],
    "ICME": ["International Conference on Multimedia & Expo"],
    "ICML": ["International Conference on Machine Learning"],
    "ICMR": ["International Conference on Multimedia Retrieval"],
    "IJCAI": ["International Joint Conference on Artificial Intelligence"],
    "NIPS": ["Conference on Neural Information Processing Systems"],
    "SIGIR": ["Research and Development in Information Retrieval"],
    "SIGKDD": ["Knowledge Discovery and Data Mining"],
    "SIGMOD": ["Conference on Management of Data"],
    "WACV": ["Winter Conference on Applications of Computer Vision"],
    # journals
    "AI": ["Artificial Intelligence"],
    "Comm. ACM": ["Communications of the ACM",
                 "Commun. ACM"],
    "IJCV": ["International Journal of Computer Vision"],
    "JMLR": ["Journal of Machine Learning Research"],
    "TCSVT": ["Transactions on Circuits and Systems for Video Technology",
              "Trans. Cir. and Sys. for Video Technol."],
    "TIP": ["Transactions on Image Processing"],
    "TMM": ["Transactions on Multimedia"],
    "TNNLS": ["Transactions on Neural Networks and Learning Systems"],
    "TOG": ["Transactions on Graphics",
            "ACM Trans. Graph."],
    "TOMM": ["Transactions on Multimedia Computing, Communications and Application",
             "Trans. Multimedia Comput. Commun. Appl."],
    "TPAMI": ["Transactions on Pattern Analysis and Machine Intelligence"],
}

# Entry types. Only conferences and journals are handled for now; books,
# patents, etc. would have to be added here.
# https://www.openoffice.org/bibliographic/bibtex-defs.html
TYPE = {
    "article": "journal",
    "conference": "conference",
    "inproceedings": "conference",
    "proceedings": "conference",
}

# words in a title that are **NOT** capitalised
STOP_WORD = [
    "a", "an", "and", "as", "by", "for", "in", "into", "of", "on", "onto",
    "through", "the", "to", "under", "using", "via", "with",
]

# in `<prefix>-<word>`, the <word> will NOT be capitalised
PREFIX = [
    "anti", "auto", "multi", "semi", "un", "uni", "weakly"
]

# in `<word>-<suffix>`, the <suffix> will NOT be capitalised
SUFFIX = [
    "based",
]

# Body of the regex character class for the characters allowed inside a
# field value; wide enough to match e.g. an abstract.
# Adding `=` to it seems to cause errors.
VOCAB = r"\w\s\<\>\[\]\(\)\{\}\"\'\`\^\+\*\~\–\-\/\\\,\.\:\;\!\?\&"


class Cite:
    """Plain record holding the BibTeX fields used to build a reference.

    Only the attributes listed in ``__slots__`` can be set. An attribute
    that was never assigned raises ``AttributeError`` on access, so
    ``hasattr`` can be used to tell whether a field was present.
    """

    __slots__ = [
        "articleno",
        "author",
        "booktitle",
        "number",
        "numpages",
        "pages",
        "paper_type",
        "title",
        "volume",
        "year",
    ]


def parse_type(bib):
    """Return the kind of publication described by a BibTeX entry.

    Args:
        bib: the whole entry as one string, starting with `@<type>{<key>,`.

    Returns:
        The value `TYPE` maps the (lower-cased) entry type to.

    Raises:
        AssertionError: if the text does not look like a BibTeX entry, or
            the entry type is not listed in `TYPE`.
    """
    # pattern: @<type>{<ref>, ... }
    # The citation key may contain any character except whitespace, commas
    # and braces, so keys such as `DBLP:conf/cvpr/HeZRS16` or `he-2016`
    # are accepted as well.
    match_pat = r"\@([a-zA-Z]+)\{\s*([^\s,{}]+)\s*\,.*\}"
    m_obj = re.match(match_pat, bib, re.S)
    assert m_obj is not None, "* MALFORMED BIBTEX ENTRY"
    paper_type = m_obj.group(1).lower()
    print("paper type:", paper_type)
    ref_str = m_obj.group(2)
    print("ref string:", ref_str)

    paper_t = TYPE.get(paper_type)
    assert paper_t is not None, \
        "* UNSUPPORTED ARTICLE TYPE: {}".format(paper_type)

    return paper_t


def short_name(name):
    """Abbreviate a name to its capitalised initial followed by a dot.

    `Tom`, `tom` -> `T.`; an empty string gives just `.`.
    """
    normalised = name.lower().capitalize()
    initial = normalised[0:1]
    return "{}.".format(initial)


def parse_author(s):
    """Turn a BibTeX `author` value into a list of formatted names.

    Authors are separated by ` and `. Three layouts are recognised
    (F = family name, G = given name, M = middle name):

      0. `<F>, <G> [<M> ...]`           -> initials of G/M, then F
      1. `<short G>. [<short M>.] {<F>}` -> given part kept as written, then F
      2. `{<G> <F>}`                     -> initial of G, then F

    An author that matches none of the layouts is reported on stdout and
    left out of the result instead of being dropped silently.

    Args:
        s: the raw value of the `author` field.

    Returns:
        List of names such as `K. He`, in the order they appear.

    Raises:
        AssertionError: if not a single author could be recognised.
    """
    raw_list = s.split(" and ")
    print("#author:", len(raw_list))

    pat_list = [re.compile(p) for p in (
        # pattern 0: <F>, <G> [<short M>.]
        r"\s*([a-zA-Z\-]+)\,\s+([a-zA-Z\-\s\.]+)\s*",
        # pattern 1: <short G>. [<short M>.] {<F>}
        r"\s*([a-zA-Z\.\s]+)\s+\{([a-zA-Z]+)\}\s*",
        # pattern 2: {<G> <F>}
        r"\s*\{([a-zA-Z]+)\s+([a-zA-Z]+)\}\s*",
    )]

    author_list = []
    for aid, a in enumerate(raw_list):
        print(a)
        for pid, p in enumerate(pat_list):
            m = p.match(a)
            if m is None:
                continue

            if 0 == pid:
                fn = m.group(1)
                # split() without an argument ignores repeated/trailing
                # blanks, which would otherwise become stray `.` initials
                gn = m.group(2).split()
                if (["Li"] == gn) and ("Fei-Fei" == fn):
                    # special case: `Fei-Fei, Li` has the two parts swapped
                    fn, gn = "Li", ["Fei-Fei"]
                name = " ".join([short_name(_gn) for _gn in gn] + [fn])
            elif 1 == pid:
                # the given part is already abbreviated: keep it as written
                name = " ".join(m.group(1).split() + [m.group(2)])
            else:
                name = "{} {}".format(short_name(m.group(1)), m.group(2))

            author_list.append(name)
            print(aid + 1, '|', name)
            break
        else:
            # no layout matched this author
            print("* UNRECOGNISED AUTHOR FORMAT, SKIPPED: {}".format(a))

    assert len(author_list) > 0, "* NO AUTHOR"
    return author_list


def less_author(author_list):
    """Join the authors with `, `, keeping only the first `n_name` of them.

    The limit is read from the command-line option `--n_name`; a value
    <= 0 means no limit. `, et al` is appended when authors were left out.
    """
    total = len(author_list)
    limit = args.n_name
    shown = min(limit, total) if limit > 0 else total

    text = ", ".join(author_list[:shown])
    if total > shown:
        text += ", et al"
    return text


def parse_page(s):
    """Parse a BibTeX `pages` value into a `(begin, end)` pair of ints.

    A range such as `770--778` or `1-10` gives `(770, 778)` / `(1, 10)`.
    A single page such as `123` gives `(123, 123)`.

    Args:
        s: the raw value of the `pages` field.

    Returns:
        Tuple `(begin, end)`.

    Raises:
        AssertionError: if the value does not start with a page number.
    """
    # At least one non-digit separator is required between the two numbers;
    # with an optional separator a lone number like `123` would be split
    # into `12` and `3` by regex backtracking.
    m = re.match(r"\s*(\d+)\D+(\d+)", s)
    if m is not None:
        begin = int(m.group(1))
        end = int(m.group(2))
    else:
        m = re.match(r"\s*(\d+)", s)
        assert m is not None, "* UNRECOGNISED PAGES: {}".format(s)
        begin = end = int(m.group(1))
    print("pages:", begin, ',', end)
    return (begin, end)


def parse_hyphen_word(word):
    """Capitalise a hyphenated title word part by part.

    The word is split on `-` (pattern: w1-w2[-w3...]). A part that already
    starts with an upper-case ASCII letter or a digit is kept as written.
    Otherwise:

      * the first part is always capitalised;
      * a later part is capitalised unless it is a stop word, it is a
        suffix listed in `SUFFIX`, or the part before it is a prefix
        listed in `PREFIX` (e.g. `multi-scale` -> `Multi-scale`).

    Args:
        word: a title word containing at least one `-`.

    Returns:
        The word with its parts re-joined by `-`.
    """
    w_list = word.split('-')
    assert len(w_list) > 1
    # Pascal case
    pat_pascal = re.compile(r"[A-Z\d]+[a-z\d]+[A-Z]\w*\:?")
    # all caps
    pat_all_cap = re.compile(r"[A-Z\d]+\:?")

    parts = []
    for i, _w in enumerate(w_list):
        if (pat_pascal.match(_w) is None) and (pat_all_cap.match(_w) is None):
            if 0 == i:
                _w = _w.lower().capitalize()
            # The previous part is compared in lower case because it may
            # have been typed capitalised (`Multi-scale`).
            elif (_w not in STOP_WORD) and (_w not in SUFFIX) and \
                    (w_list[i - 1].lower() not in PREFIX):
                _w = _w.lower().capitalize()
        parts.append(_w)

    return '-'.join(parts)


def parse_title(title):
    """Put a title into title case, word by word, and print the result.

    Words are separated by single spaces. Hyphenated words are delegated to
    `parse_hyphen_word`. A word for which one of the two patterns below
    matches at its start is kept as written; stop words are kept as written
    unless they come first; every other word is capitalised.
    """
    # Pascal case
    mixed_case = re.compile(r"[A-Z\d]+[a-z\d]+[A-Z]\w*\:?")
    # all caps
    upper_case = re.compile(r"[A-Z\d]+\:?")

    def _fix(pos, word):
        if '-' in word:
            return parse_hyphen_word(word)
        if mixed_case.match(word) is not None \
                or upper_case.match(word) is not None:
            return word
        if pos != 0 and word in STOP_WORD:
            return word
        return word.lower().capitalize()

    fixed = (_fix(pos, word) for pos, word in enumerate(title.split(' ')))
    res = ' '.join(fixed).strip()
    print("title:", res)
    return res


def parse_booktitle(s):
    """Map a conference/journal name to its abbreviation in `JC_ABBR`.

    Entries are tried in the order of `JC_ABBR`; the first one whose
    abbreviation occurs in `s` (case-sensitive) or one of whose full names
    occurs in `s` (case-insensitive) wins.

    Raises:
        AssertionError: if no entry matches.
    """
    lowered = s.lower()

    def _hits(abbr):
        if abbr in s:
            return True
        return any(full.lower() in lowered for full in JC_ABBR[abbr])

    bt = next((abbr for abbr in JC_ABBR if _hits(abbr)), None)

    assert bt is not None, \
        "* UNRECOGNISED CONFERENCE/JOURNAL: {}".format(s)
    print("booktitle:", bt)
    return bt


def gen_ref(cite_obj):
    """Build the reference string for a filled-in `Cite` object.

    Layout:
        conference: `<authors>. <title>[C]//<booktitle> <year>...`
        journal:    `<authors>. <title>[J]. <booktitle>, <year>...`
    followed by `, <volume>[(<number>)]` and the page range when those
    fields are set, and a final `.`.

    Returns:
        The reference string, or None (after printing a message) when the
        paper type is neither `conference` nor `journal`.
    """
    pieces = ["{}. {}".format(less_author(cite_obj.author), cite_obj.title)]

    venue_fmt = {
        "conference": "[C]//{} {}",
        "journal": "[J]. {}, {}",
    }.get(cite_obj.paper_type)
    if venue_fmt is None:
        print("* UNSUPPORTED ARTICLE TYPE")
        return None
    pieces.append(venue_fmt.format(cite_obj.booktitle, cite_obj.year))

    # the issue number is only shown together with a volume
    if hasattr(cite_obj, "volume"):
        pieces.append(", {}".format(cite_obj.volume))
        if hasattr(cite_obj, "number"):
            pieces.append("({})".format(cite_obj.number))

    # explicit page range first; otherwise derive one from the article
    # number and the page count
    if hasattr(cite_obj, "pages"):
        pieces.append(": {}-{}".format(*cite_obj.pages))
    elif hasattr(cite_obj, "articleno") and hasattr(cite_obj, "numpages"):
        pieces.append(": {0}:1-{0}:{1}".format(
            cite_obj.articleno, cite_obj.numpages))

    pieces.append(".")
    return "".join(pieces)


# Load the BibTeX entry and flatten it into a single line of text.
# UTF-8 (with or without a BOM) is tried first; a file in another encoding
# falls back to the platform default encoding.
try:
    with open(args.f, "r", encoding="utf-8-sig") as f:
        _bib_lines = [line.strip() for line in f]
except UnicodeDecodeError:
    with open(args.f, "r") as f:
        _bib_lines = [line.strip() for line in f]

# Lines are joined with a space so that a value wrapped over several lines
# does not get its words glued together (`... and` + `Doe, Jane`). Blank
# lines are dropped so that the text still starts with `@`.
bib = " ".join(line for line in _bib_lines if line)


cite = Cite()
cite.paper_type = parse_type(bib)


# pattern: <key> = {<value>}[,}\s]
# The trailing `[,}\s]` (i.e. `,` or `}` or white space) is matched as well
# for convenience and removed again before the item is split.
item_pat = re.compile(r"\w+\s*\=\s*[\{\"][" + VOCAB + r"]*[\}\"][\,\}\s]")
item_list = item_pat.findall(bib)
print("#item:", len(item_list))

item_split_pat = re.compile(r"(\w+)\s*\=\s*[\{\"]([" + VOCAB + r"]*)[\}\"]")

# integer-valued fields -> label printed once the value is stored
_INT_FIELD_LABEL = {
    "year": "year:",
    "volume": "volume:",
    "number": "number:",
    "articleno": "article No.:",
    "numpages": "num pages:",
}

for _item in item_list:
    # simply remove the last redundant character
    m_obj = item_split_pat.match(_item[:-1])
    assert m_obj is not None
    k, v = m_obj.group(1).lower(), m_obj.group(2)
    if not v:
        # empty value: nothing to store
        continue

    if k in _INT_FIELD_LABEL:
        setattr(cite, k, int(v))
        print(_INT_FIELD_LABEL[k], getattr(cite, k))
    elif "author" == k:
        cite.author = parse_author(v)
    elif "title" == k:
        cite.title = parse_title(v)
    elif "pages" == k:
        cite.pages = parse_page(v)
    elif k in ("booktitle", "journal"):
        if "journal" == k:
            # a `journal` field is only expected in a journal entry
            assert "journal" == cite.paper_type
        cite.booktitle = parse_booktitle(v)

# Build the reference, echo it and save it to the output file.
ref = gen_ref(cite)
print("\n{}".format(ref))
with open(args.o, "w") as f:
    f.write("{}\n".format(ref))

# On Windows, open the result with its associated application.
# os.startfile is used instead of a `start <path>` shell command so that
# output paths containing spaces or shell metacharacters work too.
if "Windows" == platform.system():
    try:
        os.startfile(args.o)
    except OSError as err:
        # opening the file is a convenience only; the file is already saved
        print("* CANNOT OPEN {}: {}".format(args.o, err))

References

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

乱搞

latex

引用

python

regex

由bibtex生成引用文献字符串的相关文章

匹配所有有效格式 IPv6 地址的正则表达式

乍一看我承认这个问题看起来像是这个问题以及与之相关的任何其他问题的重复匹配有效 IPv6 地址的正则表达式 https stackoverflow com questions 53497 regular expression that
str.translate 与 str.replace - 何时使用哪一个？

何时以及为什么使用前者而不是后者反之亦然目前尚不完全清楚为什么有些人使用前者以及为什么有些人使用后者它们有不同的目的 translate只能用任意字符串替换单个字符但一次调用可以执行多次替换它的参数是一个特殊的表它将单个字符映射
Python 转换矩阵

我有一个如下所示的列表 2 1 3 1 2 3 1 2 2 2 我想要的是一个转换矩阵它向我显示如下序列 1 后跟 1 的频率是多少 1 后面跟着 2 的频率是多少 1 后跟 3 的频率是多少 2 后跟 1 的频率是多少 2 后跟 2 的
来自 pandas 数据帧的烛台图，用日期替换索引

此代码给出了带有移动平均线的烛台图但 x 轴位于索引中我需要 x 轴位于日期中需要做什么改变 import numpy as np import pandas as pd import matplotlib pyplot as plt
雅虎财务请求功能出现 404 客户端错误

yahoo Financials的请求功能出现404 Client Error 直接点击以下网址没有问题 https finance yahoo com quote AAPL financials p AAPL https finance
带有 mkdocs 的本地 mathjax

我想在无法访问互联网的计算机上使用 MathJax 和 Mkdocs 因此我不能只调用 Mathjax CDN Config mkdocs yml site name My Docs extra javascript javascripts
Py2exe - Pmw WindowsError：[错误 3]

我正在尝试使用 Py2exe 构建独立的可执行文件我已经导入了 Pmw 类当我运行独立可执行文件时出现以下错误 Traceback most recent call last File py line 9 in
在请求中设置端口

我正在尝试利用cgminer使用 Python 的 API 我对利用requests图书馆我了解如何做基本的事情requests but cgminer想要更具体一点我想缩小 import socket import json sock
高级描述熊猫

有没有像 pandas 那样更高级的功能通常我会继续这样 r pd DataFrame np random randn 1000 columns A r describe 我会得到一份很好的总结就像这样 A count 1000 000
Python 属性和 Swig

我正在尝试使用 swig 为一些 C 代码创建 python 绑定我似乎遇到了一个问题试图从我拥有的一些访问器函数创建 python 属性方法如下 class Player public void entity Entity enti
Seaborn 中没有线性拟合的散点图

我想知道是否有办法关闭seaborn中的线性拟合lmplot或者是否有一个等效函数可以生成散点图当然我也可以使用 matplotlib 但是我发现 seaborn 中的语法和美学非常吸引人例如我想绘制以下情节 import sea
根据特定字符获取整个字符串或子字符串

我有一个包含 MIME 类型的字符串例如application json 现在我想将其与实际的 HTTP 标头进行比较在本例中content type 如果标头包含 MIME 类型那么就很简单 if mimeType contentT
如何获取分类数据的分组条形图

I have a big dataset with information about students And I have to build a graph of dependencies between different value
如何按 pandas 中的值对系列进行分组？

我现在有一只熊猫Series与数据类型Timestamp 我想按日期对其进行分组并且每组中有许多行具有不同的时间看似显而易见的方法类似于 grouped s groupby lambda x x date 然而熊猫的groupby按索
如何在matplotlib中调整x轴

I have a graph like this x轴上的数据表示小时所以我希望x轴设置为0 24 48 72 而不是现在的值很难看到 0 100 之间的数据 fig1 plt figure ax fig1 add subplot 11
从 wxPython 事件处理程序中调用函数

我正在努力寻找一种在 wxPython 事件处理函数中使用函数的方法假设我有一个按钮单击该按钮时它会使用事件处理程序运行一个名为 OnRun 的函数但是用户忘记单击 OnRun 按钮之前的 RadionButton 我想弹出一个
Django 将 JSON 数据传递给静态 getJSON/Javascript

我正在尝试从 models py 中获取数据并将其序列化为views py 中的 JSON 对象模型 py class Platform models Model platformtype models CharField max len
无需访问 Internet 即可部署 Django 的简单方法？

我拥有的是使用 Django 开发的 Intranet 站点的开发版本以及放置在 virtualenv 中的一些外部库它运行良好我可以在任何具有互联网连接的计算机上使用相同的参数使用 pip 轻松设置 virtualenv 但是不幸
如何使用 enumerate 来倒数？

letters a b c 假设这是我的清单在哪里for i letter in enumerate letters 将会 0 a 1 b 2 c 我怎样才能让它向后枚举如 2 a 1 b 0 c 这是一个很好的解决方案并且工作完美 i
在Python中停止ThreadPool中的进程

我一直在尝试为控制某些硬件的库编写一个交互式包装器用于 ipython 有些调用对 IO 的影响很大因此并行执行任务是有意义的使用 ThreadPool 几乎效果很好 from multiprocessing pool import

随机推荐

期刊和会议论文的区别

文章目录期刊主要用途标志会议论文标志区分期刊主要用途评审职称硕士小论文发表标志期刊论文著录有作者论文题名期刊名出版年卷 volume 期 Issue 起止页码等信息一般期刊以年为单位称为卷在每年中又有很多
在vue项目中优雅的使用Svg

本文主要以 vue cli3 搭建的项目为例来聊一下如何在项目中更优雅的使用 svg 众所周知 vue cli3 已经推出很长一段时间了大家可以感受一下 vue cli3 带来的零配置体验 But 也相应带来了一些弊端就是如归需要修改
【星海随笔】SQL的基础操作

架构话查询语言 Structured Query Language DDL 数据定义语言 dataDefinition Language CREATE ALTER DROP DML 数据操作语言 data Manipulation Lang
如何用CSS实现响应式布局

在今天的移动互联网的时代越来越多的人通过手机或平板等移动设备访问网站因此实现响应式布局变得越来越重要在这篇博客中我们将学习如何使用CSS实现响应式布局什么是响应式布局响应式布局是指在不同尺寸和设备上自适应调整页面显示效果的能力
图片上传几种方式总结

1 html表单上传不推荐最传统的图片上传方式是form表单上传使用form表单的input type file 控件打开系统的文件选择对话框从而达到选择文件并上传的目的 form表单上传表单上传需要注意以下几点 1 提供for
【CV】使用 OpenCV 将照片变成卡通

大家好我是Sonhhxg 柒希望你看完之后能对你有所帮助不足请指正共同学习交流个人主页 Sonhhxg 柒的博客 CSDN博客欢迎各位点赞收藏留言系列专栏机器学习 ML 自然语言处理 NLP 深度学习 DL fore
i.MX6ULL - 问题解决：NFS挂载失败 - VFS: Unable to mount root fs on unknown-block(2,0)

i IMX6ULL 问题解决 NFS挂载失败 VFS Unable to mount root fs on unknown block 2 0 开发环境移植的linux5 4 7 0 ubuntu1804 x64 arm linux gn
毕业设计-机器视觉深度学习的视频去水印算法

目录前言课题背景和意义实现技术思路实现效果图样例前言大四是整个大学期间最忙碌的时光一边要忙着备考或实习为毕业后面临的就业升学做准备一边要为毕业设计耗费大量精力近几年各个学校要求的毕设项目越来越难有不少课题是研究生级别难度
MFC视频教程(孙鑫)学习笔记2-掌握C++

这一集中主要总结了C 经典语法与应用 1 C 的三大特性封装继承多态 2 C 中提供了一套输入输出流类的对象它们是cin cout和cerr 对应c语言中的三个文件指针stdin stdout stderr 分别指向终端输入终端
Ubuntu下网页打开速度缓慢的解决方法

Ubuntu下网页打开速度缓慢的解决方法网速较慢可能是网络配置的原因导致解决步骤如下以下指令均在Ubuntu终端输入执行一查看Ubuntu版本信息 lsb release a 二使用pdnsd软件为本机搭建DNS代理服务器 1
Redis第二十讲 Redis哨兵自动故障转移与优缺点

sentinel哨兵是特殊的redis服务不提供读写服务主要用来监控redis实例节点哨兵架构下client端第一次从哨兵找出redis的主节点后续就直接访问redis的主节点不会每次都通过sentinel代理访问redis的主节
ES 聚合函数的用法

1 ES聚合分析是什么聚合分析是数据库中重要的功能特性完成对一个查询的数据集中数据的聚合计算如找出某字段或计算表达式的结果的最大值最小值计算和平均值等 ES作为搜索引擎兼数据库同样提供了强大的聚合分析能力对一个数据集求
K和KB的区别

来源综合自己和网上的观点问题1 K与KB之间有什么区别我在做一道解时就是某计算机字长16位它的存储容量是1MB 按字编址这经的寻址范围是 A 512K B 1M C 512KB 答案给的是A 我很不解为什么512K与512K
(error) CROSSSLOT Keys in request don‘t hash to the same slot 解决方法

Redis 哈希槽基本概念哈希槽 hash slot 是来自Redis Cluster的概念但在各种集群方案都有使用哈希槽是一个key的集合 Redis集群共有16384个哈希槽每个key通过CRC16散列然后对16384进行取模来
Python opencv 机器学习 5.knn pca降维 ocr识别数字 mnist数据集

coding utf 8 from numpy import import numpy as np import struct import matplotlib pyplot as plt import operator 定义一个全局特征
轻松获取在线媒体：视频下载工具推荐

ytdl org youtube dl Stars 121 0k License Unlicense youtube dl 一个命令行程序可以从YouTube com和其他视频网站下载视频基于 Python 实现你可以在Unix Wi
上班之路华为OD真题 200

public class Main public static char map 地图 public static int t 转弯次数 public static int c 路障个数 public static int n 地图行数 p
Android 项目必备（十一）--＞轮询操作

文章目录前言实战前言什么叫轮询请求简单理解就是 App 端每隔一定的时间重复请求的操作就叫做轮询请求比如 App 端每隔一段时间上报一次定位信息 App 端每隔一段时间拉去一次用户状态等这些应该都是轮询请求为何不用长连接代替
ibm服务器开机后显示器闪烁,IBM E50彩色显示器，开机后电源指示灯闪烁，机内有“咔嗒”声，黑屏...

经观察发现咔嗒声是消磁继电器断开闭合的声音经测量该继电器13V供电电压时有时无该故障现象特别不易判断但从屏幕无显示这一故障现象入手可初步判定故障范围可能在电源电路和行扫描电路首先不开机直观检查相关电路未见异常然后测
由bibtex生成引用文献字符串

word 文档写引用文献用 mendeley 的插件生成的效果似乎一般用法见 1 2 而且自己改格式的那个网页令人火大可能我网速问题用 python 写了个脚本通过解析 bibtex 来生成格式自编项目页见 4 Code 目前

由bibtex生成引用文献字符串

References

由bibtex生成引用文献字符串 的相关文章

随机推荐

热门标签

由bibtex生成引用文献字符串的相关文章