Introduction
Dedupe is a Python library that uses machine learning to quickly perform fuzzy matching, deduplication, and entity resolution on structured data.
Input: a single CSV file.
Execution: the user labels a small number of candidate record pairs at the console, following the prompts.
Output: a single CSV file in which similar records are tagged.
Examples of what Dedupe can do (a condensed code sketch of the workflow follows this list):
- Remove duplicate entries from a spreadsheet of names and addresses
- Link a list with customer information to a list with order history, even without unique customer IDs
- Take a database of campaign contributions and figure out which donations were made by the same person, even when the name on each record was entered slightly differently
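To make that workflow concrete, here is a condensed sketch of the dedupe 2.x API that the full example below implements. The field definition, threshold, and tiny inline dataset are purely illustrative:

import dedupe

# record_id -> row dict; in the full example this is read from a CSV
data = {
    0: {'name': 'john smith'},
    1: {'name': 'jon smith'},
    2: {'name': 'mary jones'},
}
fields = [{'field': 'name', 'type': 'String'}]  # which column(s) to compare
deduper = dedupe.Dedupe(fields)
deduper.prepare_training(data)      # sample candidate record pairs
dedupe.console_label(deduper)       # label a few pairs interactively
deduper.train()
clusters = deduper.partition(data, 0.5)  # cluster records above the score threshold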
Python library:
https://github.com/dedupeio/dedupe
Example
The original CSV file:
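The screenshot of the data is not reproduced here. As a purely hypothetical stand-in with the same shape (the code below expects a name column in the third position, and the commented-out field also references a type column), the file might look like:

id,type,name
1,drug,aspirin
2,drug,Aspirin
3,drug,asprin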
The code below deduplicates on the third column, name.
Code:
# Site: www.omegaxyz.com
# *_*coding:utf-8 *_*
import os
import csv
import logging
import optparse

import dedupe


def readData(filename):
    """Read the CSV into a dict of record_id -> row dict,
    lower-casing every value so comparisons are case-insensitive."""
    data_d = {}
    with open(filename, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for idx, row in enumerate(reader):
            row = dict((k, v.lower()) for k, v in row.items())
            data_d[idx] = row
    return data_d


# These generators will give us the corpora for setting up the Text
# distance metrics
def names(data):
    for record in data.values():
        yield record['name']


def types(data):
    for record in data.values():
        yield record['type']


if __name__ == '__main__':

    # ## Logging
    # Dedupe uses Python logging to show or suppress verbose output. Added
    # for convenience. To enable verbose logging, run this script with `-v`.
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)')
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose > 1:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    input_file = 'resource_all.csv'
    output_file = 'resource_all_output.csv'
    settings_file = 'resource_all_settings.json'
    training_file = 'resource_all_training.json'

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=2)
    else:
        # Define the fields dedupe will pay attention to. A second field
        # on the 'type' column could be added like this:
        # {'field': 'type', 'variable name': 'type', 'type': 'Text',
        #  'corpus': types(data_d), 'has missing': False},
        fields = [
            {'field': 'name',
             'variable name': 'name Text',
             'type': 'Text',
             'corpus': names(data_d),
             'has missing': False},
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=2)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(data_d, training_file=tf)
        else:
            deduper.prepare_training(data_d)

        # ## Active learning
        # Starts the training loop. Dedupe will find the next pair of
        # records it is least certain about and ask you to label them as
        # duplicates or not. Use the 'y', 'n' and 'u' keys to flag
        # duplicates; press 'f' when you are finished.
        print('starting active labeling...')
        dedupe.console_label(deduper)
        deduper.train()

        # When finished, save our training away to disk.
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we
        # run this file.
        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)

    # Cluster the records; 0.5 is the score threshold above which pairs
    # are considered duplicates.
    clustered_dupes = deduper.partition(data_d, 0.5)
    print('# duplicate sets', len(clustered_dupes))

    # ## Writing Results
    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.
    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    with open(output_file, 'w', encoding='utf-8') as f_output, \
            open(input_file, encoding='utf-8') as f_input:
        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames
        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()
        for row_id, row in enumerate(reader):
            row.update(cluster_membership[row_id])
            writer.writerow(row)
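The 0.5 passed to partition() is a score threshold: lower values merge records more aggressively (higher recall, lower precision), higher values are more conservative. A small sketch for comparing thresholds, assuming the deduper and data_d objects from the script above:

# Sketch: see how the number of clusters changes with the threshold.
for threshold in (0.3, 0.5, 0.7):
    clusters = deduper.partition(data_d, threshold)
    print(threshold, '->', len(clusters), 'duplicate sets')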
Labeling a small amount of data:
The program will print the name fields of two records; based on your own judgment, mark whether the two names refer to the same entity. The options are yes, no, unsure, and finished.
The generated CSV:
Two new columns have been added: a cluster ID, where records sharing a cluster ID are considered the same entity, and a confidence score.
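To inspect the result, you can group the output by cluster. A minimal sketch, assuming pandas is installed and using the output file and name column from the example above:

import pandas as pd

df = pd.read_csv('resource_all_output.csv')
# Print only the clusters that actually contain more than one record.
for cluster_id, group in df.groupby('Cluster ID'):
    if len(group) > 1:
        print(group[['Cluster ID', 'confidence_score', 'name']])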
When I use dedupe I get AttributeError: 'Dedupe' object has no attribute 'prepare_training'. Which version of dedupe are you using? Thanks!
Sorry, my environment has changed since then; you can check which dedupe release was current as of 2020-07-24.
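For reference, prepare_training exists only in newer dedupe releases, so this error usually means an older version is installed. A quick way to check the installed version with the standard library (Python 3.8+):

from importlib.metadata import version
print(version('dedupe'))  # prints the installed dedupe release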
Hi, which part of the code determines how the output is displayed? My data is in Chinese, but during labeling the prompts come out as pinyin.
This script is just the calling code; for the details you would have to look at the source of the dedupe package itself.
Could you share the dataset?
Hi, the data is here: https://github.com/xyjigsaw/COVID19-KBQA-DEMO