Association analysis in Python 3 with the Apriori algorithm
Unlike most modules, Apriori is not something you can simply pip install; the implementation below was found online and wrapped into a class that can be used directly.
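Before diving into the class, a quick refresher on the two measures it computes: support is counted here as a raw number of transactions containing an itemset, and the confidence of a rule A->B is the support of A and B together divided by the support of A. Here is a minimal sketch of both measures, with illustrative helper names that are not part of the class below:

def count_support(transactions, itemset):
    # Number of transactions that contain every item of `itemset`
    return sum(1 for row in transactions if set(itemset).issubset(row))

def rule_confidence(transactions, antecedent, consequent):
    # confidence(A -> B) = support(A and B) / support(A)
    denom = count_support(transactions, antecedent)
    joint = count_support(transactions, set(antecedent) | set(consequent))
    return joint / denom if denom else 0.0

The full class follows: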
class Apriori:
    def __init__(self, data, min_support, min_confidence):
self.samples = data
self.min_support = min_support
self.min_confidence = min_confidence
self.fre_list = list()
self.record_list = list()
self.record_dict = dict()
def get_c1(self):
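        # Build the candidate 1-itemsets (C1) and count each item's occurrences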
new_dict = dict()
for row in self.samples:
for item in row:
if item not in self.fre_list:
self.fre_list.append(item)
new_dict[item] = 1
else:
new_dict[item] = new_dict[item] + 1
self.fre_list.sort()
print("candidate set:")
self.print_dict(new_dict)
        # Drop the 1-itemsets that fall below the minimum support
        for key in self.fre_list:
            if new_dict[key] < self.min_support:
                del new_dict[key]
        # Keep only the surviving frequent items for the next self-join round
        self.fre_list = sorted(new_dict.keys())
        print("after pruning:")
        self.print_dict(new_dict)
        self.record_list = self.fre_list
        self.record_dict = new_dict
def get_candidateset(self):
new_list = list()
        # Self-join
for i in range(0, len(self.fre_list)):
            # Pair each itemset with every later one exactly once
            for j in range(i + 1, len(self.fre_list)):
                # Two k-itemsets can be self-joined only if they share k-1 items
if self.has_samesubitem(self.fre_list[i], self.fre_list[j]):
curitem = self.fre_list[i] + ',' + self.fre_list[j]
curitem = curitem.split(",")
#去重
curitem = list(set(curitem))
curitem.sort()
curitem = ','.join(curitem)
                    # A k-itemset is a candidate only if all of its subsets are frequent
                    if not self.has_infresubset(curitem) and not self.already_contains(curitem, new_list):
new_list.append(curitem)
new_list.sort()
return new_list
    def has_samesubitem(self, str1, str2):
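        # Two same-size itemsets can be self-joined iff they differ in exactly one item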
str1s = str1.split(",")
str2s = str2.split(",")
if len(str1s) != len(str2s):
return False
nums = 0
for items in str1s:
if items in str2s:
nums += 1
str2s.remove(items)
        return nums == len(str1s) - 1
    def judge(self, candidatelist):
        # Compute the support of every candidate itemset
new_dict = dict()
for item in candidatelist:
new_dict[item] = self.get_support(item)
print("candidate set:")
self.print_dict(new_dict)
        # Pruning: a frequent itemset must reach the minimum support
        new_list = list()
        for item in candidatelist:
            if new_dict[item] < self.min_support:
                del new_dict[item]
            else:
                new_list.append(item)
self.fre_list = new_list
print("after pruning:")
self.print_dict(new_dict)
return new_dict
    def has_infresubset(self, item):
        # The search is level-wise, so for a Ck candidate it is enough to check
        # whether any of its (k-1)-subsets is infrequent
        subset_list = self.get_subset(item.split(","))
        for item_list in subset_list:
            if not self.already_contains(item_list, self.fre_list):
                return True
        return False
    def get_support(self, item, split_tag=True):
        # Count the transactions that contain every item of the itemset;
        # itemset strings are "," separated, rule sides are "^" separated
        if split_tag:
            items = item.split(",")
        else:
            items = item.split("^")
        support = 0
        for row in self.samples:
            tag = True
            for curitem in items:
                if curitem not in row:
                    tag = False
                    break
            if tag:
                support += 1
        return support
    def get_fullpermutation(self, arr):
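        # Recursively build every ordering of arr (factorial cost, which is
        # acceptable here because itemsets stay small)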
if len(arr) == 1:
return [arr]
else:
newlist = list()
for i in range(0, len(arr)):
sublist = self.get_fullpermutation(arr[0:i] + arr[i + 1:len(arr)])
for item in sublist:
curlist = list()
curlist.append(arr[i])
curlist.extend(item)
newlist.append(curlist)
return newlist
    def get_subset(self, arr):
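        # Return all (len(arr)-1)-element subsets of arr, one ordering each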
newlist = list()
for i in range(0, len(arr)):
arr1 = arr[0:i] + arr[i + 1:len(arr)]
newlist1 = self.get_fullpermutation(arr1)
for newlist_item in newlist1:
newlist.append(newlist_item)
newlist.sort()
        newlist = self.remove_duplicate(newlist)
return newlist
    def remove_duplicate(self, arr):
        # Keep the first occurrence of each itemset, comparing them as sets
        newlist = list()
        for i in range(0, len(arr)):
            if not self.already_contains(arr[i], newlist):
                newlist.append(arr[i])
        return newlist
    def already_contains(self, item, curlist):
        # True when `item` (a "," string, list or tuple) already appears in
        # curlist; itemsets are compared as sets, so order does not matter
        if isinstance(item, str):
            items = item.split(",")
        else:
            items = item
        for i in range(0, len(curlist)):
            if isinstance(curlist[i], str):
                curitems = curlist[i].split(",")
            else:
                curitems = curlist[i]
            if set(items) == set(curitems):
                return True
        return False
    def print_dict(self, curdict):
        for curkey in curdict.keys():
            print("%s:%s" % (curkey, curdict[curkey]))
    # Helpers for computing the association rules
    def get_all_subset(self, s):
        # Every non-empty subset of s (including s itself), as tuples
        from itertools import combinations
        return sum(map(lambda r: list(combinations(s, r)), range(1, len(s) + 1)), [])
    def cal_associative_rule(self, frelist):
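        # For each frequent itemset, emit every rule "subset -> remainder" and
        # keep the ones whose confidence reaches min_confidence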
rule_list = list()
rule_dict = dict()
for fre_item in frelist:
fre_items = fre_item.split(",")
subitem_list = self.get_all_subset(fre_items)
for subitem in subitem_list:
                # Skip the subset that is the itemset itself
if len(subitem) == len(fre_items):
continue
else:
difference = set(fre_items).difference(subitem)
rule_list.append("^".join(subitem) + "->" + "^".join(difference))
print("The rule is:")
for rule in rule_list:
            conf = self.cal_rule_confidence(rule)
print(rule, conf)
if conf >= self.min_confidence:
rule_dict[rule] = conf
print("The associative rule is:")
for key in rule_list:
if key in rule_dict.keys():
print(key, ":", rule_dict[key])
    def cal_rule_confidence(self, rule):
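        # rule is a string like "I1^I3->I2"; confidence = support(A and B) / support(A)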
rules = rule.split("->")
support1 = self.get_support("^".join(rules), False)
support2 = self.get_support(rules[0], False)
if support2 == 0:
return 0
rule_confidency = float(support1) / float(support2)
return rule_confidency
def doData(self):
        self.get_c1()
        # Keep self-joining and pruning until a round produces no frequent
        # itemsets; the previous round's frequent set is the final result
while True:
self.record_list = self.fre_list
new_list = self.get_candidateset()
judge_dict = self.judge(new_list)
if len(judge_dict) == 0:
break
else:
self.record_dict = judge_dict
print("The final frequency set is:")
print(self.record_list)
self.cal_associative_rule(self.record_list)
if __name__ == '__main__':
    # Items must be strings
    # Apriori algorithm primer: http://blog.csdn.net/baimafujinji/article/details/53456931
    data = [
        ["I2", "I4"],
        ["I2", "I3", "I6"],
        ["I1", "I2", "I4"],
        ["I1", "I2", "I3"],
        ["I2", "I3", "I6"],
        ["I1", "I2"],
        ["I1", "I2", "I3", "I6"],
    ]
    # Minimum support: here 2 is a raw occurrence count rather than a true
    # support ratio; itemsets below this value are filtered out
min_support = 2
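    # For example, I4 occurs in 2 of the 7 transactions above, so with
    # min_support = 2 it just survives the pruning step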
    # Minimum confidence (background: http://blog.csdn.net/wo334499/article/details/51698810)
min_confidence = 0.6
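    # For example, the rule I1^I3->I2 derived below has confidence
    # support({I1,I2,I3}) / support({I1,I3}) = 2 / 2 = 1.0 >= 0.6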
    # The Apriori class can be moved into its own module
apriori = Apriori(data,min_support,min_confidence)
apriori.doData()
# The output is as follows:
# candidate set:
# I2:7 # I2 is the item and 7 is the number of transactions containing it (the count compared against min_support above)
# I4:2
# I3:4
# I6:3
# I1:4
# after pruning:
# I2:7
# I4:2
# I3:4
# I6:3
# I1:4
# candidate set:
# I1,I2:4
# I1,I3:2
# I1,I4:1
# I1,I6:1
# I2,I3:4
# I2,I4:2
# I2,I6:3
# I3,I4:0
# I3,I6:3
# I4,I6:0
# after pruning:
# I1,I2:4
# I1,I3:2
# I2,I3:4
# I2,I4:2
# I2,I6:3
# I3,I6:3
# candidate set:
# I1,I2,I3:2
# I2,I3,I6:3
# after pruning:
# I1,I2,I3:2
# I2,I3,I6:3
# candidate set:
# after pruning:
# The final frequency set is:
# ['I1,I2,I3', 'I2,I3,I6']
# The rule is:
# I1->I3^I2 0.5 # given I1, the probability of also seeing I3 and I2, i.e. a confidence of 0.5
# I2->I3^I1 0.2857142857142857
# I3->I2^I1 0.5
# I1^I2->I3 0.5
# I1^I3->I2 1.0
# I2^I3->I1 0.5
# I2->I3^I6 0.42857142857142855
# I3->I6^I2 0.75
# I6->I3^I2 1.0
# I2^I3->I6 0.75
# I2^I6->I3 1.0
# I3^I6->I2 1.0
# The associative rule is:
# I1^I3->I2 : 1.0
# I3->I6^I2 : 0.75
# I6->I3^I2 : 1.0
# I2^I3->I6 : 0.75
# I2^I6->I3 : 1.0
# I3^I6->I2 : 1.0
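For data this small you can sanity-check the frequent itemsets with a brute-force pass over all possible itemsets. This is only a verification sketch; the brute_force_frequent helper below is illustrative and not part of the class above:

from itertools import combinations

def brute_force_frequent(transactions, min_support):
    # Enumerate every itemset over the observed items and keep the ones that
    # appear in at least min_support transactions; exponential in the number
    # of distinct items, so only suitable for tiny datasets like this one
    items = sorted({i for row in transactions for i in row})
    frequent = {}
    for k in range(1, len(items) + 1):
        for cand in combinations(items, k):
            count = sum(1 for row in transactions if set(cand).issubset(row))
            if count >= min_support:
                frequent[",".join(cand)] = count
    return frequent

# e.g. brute_force_frequent(data, 2) should contain "I1,I2,I3" and "I2,I3,I6"
# with counts 2 and 3, matching the final frequent set printed above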