Sunday, 16 October 2016

Implement the Apriori algorithm for data mining in Python to decide how items should be organized on a shelf, using the dataset.csv file of items purchased in a mall.


PROGRAM

filename: apriori.py


from collections import defaultdict


def readFile(fn):
    """Lazily read a comma-separated transaction file.

    Each non-blank line of ``fn`` is treated as one transaction: surrounding
    whitespace and trailing commas are removed, the remainder is split on
    ``,`` and the pieces are yielded as a ``frozenset`` of item strings, so
    an item repeated on a line is kept only once.

    Blank lines are skipped. Otherwise they would be reported as a
    transaction holding the single empty-string item, which inflates the
    transaction count and lowers every support value computed from it.

    The file is opened in a ``with`` block so the handle is released once
    the generator is exhausted or closed.
    """
    with open(fn, "r") as f:
        for line in f:
            line = line.strip().rstrip(",")
            if not line:
                # Nothing left after trimming: not a transaction.
                continue
            yield frozenset(line.split(','))

def returnItemSetTL(data_iter):
    """Collect the transactions and the 1-item candidate sets.

    ``data_iter`` is consumed exactly once. Returns a pair
    ``(singletons, transactions)`` where ``transactions`` is a list with one
    ``frozenset`` per record, in input order, and ``singletons`` is a set
    holding a one-element ``frozenset`` for every distinct item seen.
    """
    # Materialise first: data_iter may be a generator that can only be
    # walked a single time.
    transactions = [frozenset(record) for record in data_iter]
    singletons = set(
        frozenset([item]) for basket in transactions for item in basket
    )
    return singletons, transactions

def returnItemWithMinSupport(itemSet,TransactionList,ms,freqSet):
    """Return the members of ``itemSet`` whose support is at least ``ms``.

    Parameters:
        itemSet: iterable of candidate itemsets (set-like objects providing
            ``issubset``).
        TransactionList: sequence of transactions to count against.
        ms: minimum support, as a fraction of ``len(TransactionList)``.
        freqSet: mapping updated in place; for every candidate that occurs
            in at least one transaction, ``freqSet[item]`` is increased by
            the number of transactions containing it.

    Returns:
        A new ``set`` with the candidates that meet the threshold.

    The threshold is taken from the ``ms`` argument, so the function does
    not depend on a module-level ``minSupport`` being defined.
    """
    total = float(len(TransactionList))
    frequent = set()
    for item in itemSet:
        count = sum(1 for trans in TransactionList if item.issubset(trans))
        if not count:
            # Candidates that never occur are neither recorded in freqSet
            # nor returned (this also avoids dividing by an empty total).
            continue
        freqSet[item] += count
        if count / total >= ms:
            frequent.add(item)
    return frequent

def joinSet(itemSet,length):
    """Build the candidate itemsets of size ``length`` from ``itemSet``.

    Every ordered pair of members is unioned; a union is kept only when it
    has exactly ``length`` elements. The result is a ``set``, so the same
    union reached through different pairs appears once.
    """
    candidates = set()
    for left in itemSet:
        for right in itemSet:
            merged = left.union(right)
            if len(merged) == length:
                candidates.add(merged)
    return candidates

def getSupport(freqSet,item,TransactionList):
    """Return the support of ``item`` as a float.

    The value is ``freqSet[item]`` divided by the number of entries in
    ``TransactionList``.
    """
    occurrences = float(freqSet[item])
    total = float(len(TransactionList))
    return occurrences / total

def runApriori(data_iter,ms,mc):
    """Run level-wise Apriori over ``data_iter`` and print the results.

    Parameters:
        data_iter: iterable of transactions (each an iterable of items).
        ms: minimum support, as a fraction of the number of transactions.
        mc: minimum confidence. Accepted for interface compatibility; this
            function does not generate rules and never reads it.

    Printed output, in order:
        * ``L1`` followed by the frequent 1-itemsets;
        * for each following level ``k``, ``L k`` followed by the candidate
          k-itemsets produced by the join step (before support filtering);
        * a list of ``(itemset_tuple, support)`` for every frequent itemset;
        * each frequent itemset of the greatest size found, in ascending
          order of support.

    Returns ``None``.

    Printing uses single-argument ``print(...)`` calls and the sort key is
    an ordinary one-argument lambda, so the function produces the same
    text on Python 2 and also runs on Python 3, where print statements and
    tuple-unpacking lambda parameters are syntax errors.
    """
    itemSet, TransactionList = returnItemSetTL(data_iter)

    # Occurrence counts for every itemset tested at any level.
    freqSet = defaultdict(int)
    # Maps itemset size -> set of frequent itemsets of that size.
    largeSet = {}

    currentLSet = returnItemWithMinSupport(itemSet, TransactionList, ms, freqSet)
    print("L1")
    print(currentLSet)

    k = 2
    while currentLSet:
        largeSet[k - 1] = currentLSet
        candidates = joinSet(currentLSet, k)
        print("L %d" % k)
        print(candidates)
        currentLSet = returnItemWithMinSupport(
            candidates, TransactionList, ms, freqSet
        )
        k += 1

    items = []
    for level in largeSet.values():
        items.extend(
            (tuple(item), getSupport(freqSet, item, TransactionList))
            for item in level
        )
    print(items)

    # Size of the largest frequent itemset; sorting is not needed for this.
    max_value = max(len(item) for item, _ in items) if items else 0
    for item, support in sorted(items, key=lambda entry: entry[1]):
        if len(item) == max_value:
            print(item)


if __name__ == '__main__':
    # Script entry point: mine dataset.csv with a 60% support threshold.
    # The thresholds are bound to names first and then handed to
    # runApriori together with the lazily-read transactions.
    minSupport = 0.6
    minConfidence = 1
    runApriori(readFile("dataset.csv"), minSupport, minConfidence)


filename: dataset.csv


Mango,Onion,Jar,Keychain,Eggs,Chocolate

Nuts,Onion,Jar,Keychain,Eggs,Chocolate
Mango,Apple,Keychain,Eggs
Mango,Toothbrush,Corn,Keychain,Chocolate
Corn,Onion,Onion,Keychain,Knife,Eggs

OUTPUT


Ccpvg@ccpvg-HP-Compaq-4000-Pro-SFF-PC:~$python apriori.py

L1
set([frozenset(['Onion']), frozenset(['Chocolate']), frozenset(['Keychain']), frozenset(['Eggs']), frozenset(['Mango'])])
L 2
set([frozenset(['Mango', 'Keychain']), frozenset(['Eggs', 'Onion']), frozenset(['Onion', 'Chocolate']), frozenset(['Mango', 'Chocolate']), frozenset(['Keychain', 'Chocolate']), frozenset(['Eggs', 'Chocolate']), frozenset(['Keychain', 'Onion']), frozenset(['Mango', 'Onion']), frozenset(['Eggs', 'Keychain']), frozenset(['Eggs', 'Mango'])])
L 3
set([frozenset(['Eggs', 'Keychain', 'Onion']), frozenset(['Eggs', 'Keychain', 'Chocolate']), frozenset(['Mango', 'Keychain', 'Chocolate']), frozenset(['Mango', 'Keychain', 'Onion']), frozenset(['Eggs', 'Mango', 'Keychain']), frozenset(['Keychain', 'Onion', 'Chocolate'])])
L 4
set([])
[(('Onion',), 0.6), (('Chocolate',), 0.6), (('Keychain',), 1.0), (('Eggs',), 0.8), (('Mango',), 0.6), (('Eggs', 'Keychain'), 0.8), (('Mango', 'Keychain'), 0.6), (('Eggs', 'Onion'), 0.6), (('Keychain', 'Onion'), 0.6), (('Keychain', 'Chocolate'), 0.6), (('Eggs', 'Keychain', 'Onion'), 0.6)]
('Eggs', 'Keychain', 'Onion')


No comments:

Post a Comment