#!/usr/bin/env python

# DONALD MINER

import sys
import re
import datetime
import random

GIVE_OUTPUT = False
GIVE_DISTROBUTION_STATS = False
FULL_PERCOLATION = False
SKIP_SUBSET = False
SKIP_DIRECTORY = True
ASSUME_TRANSITIVITY = False
DO_CLOSURE = False
PROBABILITY_OF_FAILURE = 0
PERFORM_CHECKS = False
SHUFFLE_FILES = False
PRINT_RANKS_TO_FILE = False
BRUTE_FORCE_INDEX = 78

program_start = datetime.datetime.now()


#################### DIRECTORY TREE ##########################################
def get_directories( verbose=False ):
	files = open(sys.argv[1])

	directories = {}
	directories_list = []

	i = 0
	if GIVE_OUTPUT: sys.stdout.write("Number of files and directories:  ")

	start = datetime.datetime.now()

	for path in files:
		dir_iter = directories
		path = path.rstrip().split("/")[1:]

		for directory in path[:-1]:
			dir_iter = dir_iter[directory]
		dir_iter[path[-1]] = {}

		directories_list += dir_iter

		i += 1
		if GIVE_OUTPUT: sys.stdout.write("\b"*len(str(i-1)))
		if GIVE_OUTPUT: sys.stdout.write("%d" % i)
	if GIVE_OUTPUT: sys.stdout.write("\n")

	logfile.write("Generate directory graph: %d seconds, %d files and directories\n" % ((datetime.datetime.now() - start).seconds, i))

	return directories, directories_list

def get_directories_list():
	return [ file_name.strip() for file_name in open(sys.argv[1]).readlines() ]

########### SUBSET GRAPH #####################################################
# TODO
#  * Give the option of precompiling - LOW PRIORITY
#    * Have to write precompile functions, maybe wrap the check_X, pre_X
#      and have check_X be able to work with the precompiled version.
#    * Pass the precompile functions a lot like how check is right now.
#    * Maybe wrap the precompile and the check functions in a class.
#  * Write the split_regex function that splits all possibilities of a regular
#    expression. This will be very hard to code and incorperate.
#  * Give the opton to use bit-vectors in the the RegexElement instead of
#    lists; requiers some integration as well because it needs to know how
#    many other elements there are.
#    * Might be a good idea to allow either sets or bitvectors. Bitvectors are
#      more efficient for large sets and regular lists are more efficient for
#      smaller sets. Supersets, subsets and intersects would be better fit for
#      regular lists and bitvectors would be good for dijoint.
#  * Keep a bitvector to see if a comparison has been made and get rid of 'maybe'
#  * Give different amounts of percolation.
#  * WATCH OUT: My approximations are not transitive. Thus not everything can
#    percolate!

# FEBRUARY 6 TODO
#  * implement transitive closure at the end.
#  * 
class RegexElement:
	def __init__(self, string, index):
		self.string = string
		self.supersets = []
		self.subsets = []
		self.disjoints = []
		self.intersects = []
		self.maybes = []
		self.precompilation = {}
		self.compiled = re.compile("^" + string + "$")
		self.index = index

SUPERSET = 1 # A IS A SUPERSET OF B
SUBSET = 2 # A IS A SUBSET OF B
INTERSECT = 3
DISJOINT = 4
EQUAL = 5

# TODO - Write precomple functions
def check_equal(a, b):
	if a == b:
		return EQUAL

def check_prefixes(a, b):
	for i in range(len(a)):
		if i == len(b):
			break
		if a[i] in "(.[" or b[i] in "(.[)":
			break
		if a[i] != b[i]:
			return DISJOINT

def check_suffixes(a, b):
	for i in range(len(a)):
		if i == len(b):
			break
		if a[-i-1] in ").]*+}?":
			try: 
				if a[-i-2] == "\\":
					continue
			except IndexError:
				pass
			break
		elif b[-i-1] in ").]*+}?":
			try:
				if b[-i-2] == "\\":
					continue
			except IndexError:
				pass
			break
		elif a[-i-1] != b[-i-1]:
			return DISJOINT

def check_dotstar(a, b):
	astar = False
	bstar = False

	if a.endswith(".*"):
		a = a[:-2]
		astar = True
	elif a.endswith("(/.*)?"):
		a = a[:-6]
		astar = True

	if b.endswith(".*"):
		b = b[:-2]
		bstar = True
	elif b.endswith("(/.*)?"):
		b = b[:-6]
		bstar = True

	if astar and bstar:
		if a.startswith(b):
			return SUBSET
		elif b.startswith(a):
			return SUPERSET
	elif bstar and a.startswith(b):
		return SUBSET
	elif astar and b.startswith(a):
		return SUPERSET

def check_fixed(a, b):
	a_fixed = True
	b_fixed = True

	i = 0
	while i < len(a):
		if a[i] == "\\":
			i += 1
		if a[i] in r"([.*{?+":
			a_fixed = False
		i += 1

	i = 0
	while i < len(b):
		if b[i] == "\\":
			i += 1
		if b[i] in r"([.*{?+":
			b_fixed = False
		i += 1

	if a_fixed and not b_fixed:
		if re.match(b, a):
			return SUBSET
	elif b_fixed and not a_fixed:
		if re.match(a, b):
			return SUPERSET

# TODO - Write and figure out what split_regex has to do.
def split_regex(regex):
	pass

class SubsetGraph:
	def __init__(self, tests):
		self.regexps = []
		self.tests = tests
		self.cur_index = 0

	def add_regex(self, new_regex):
		self.regexps.append(RegexElement(new_regex, self.cur_index))
		self.cur_index += 1

	def generate(self, percolate):
		# TODO - Do percolation (will be easy when the bitvector for checked
		#  things works.
		start = datetime.datetime.now()
		comparisons = 0

		for i in range(len(self.regexps)):
			for j in range(i + 1, len(self.regexps)):
				comparisons += 1

				if percolate:
					pass
				else:
					self.__add_relationship__(self.regexps[i], self.regexps[j], \
									self.__cmp__(self.regexps[i], self.regexps[j]),percolate)

		logfile.write("Generate subset graph: %d seconds, %d comparisons\n" % ((datetime.datetime.now() - start).seconds, comparisons))

	def do_closure(self, assume_transitivity):
		# TODO - This is grossly ineficcient. This doesn't work with maybes

		added_count = 0
		passes = 0
		changed = 1

		start = datetime.datetime.now()

		while(changed > 0):
			changed = 0

			for regexA in self.regexps:
				# SUBSET - All supersets of regexA are supersets of regexA's subsets.
				for regexB in regexA.supersets:
					count = len(regexB.subsets)
					regexB.subsets = list(set(regexB.subsets + regexA.subsets))

					if count < len(regexB.subsets):
						changed += len(regexB.subsets) - count

				# SUPERSET - All subsets of regexA are subsets of regexB's supersets.
				# DISJOINTNESS - All subsets of regexA are disjoint from regexA's disjoints.
				# INTERSECTIONS - All regexs that intersect regexA's subsets, also intersect regexA.
				for regexB in regexA.subsets:
					count = len(regexB.supersets + regexB.disjoints + regexA.intersects)
					regexB.supersets = list(set(regexB.supersets + regexA.supersets))
 					regexB.disjoints = list(set(regexB.disjoints + regexA.disjoints))
					regexA.intersects = list(set(regexA.intersects + regexB.intersects))
					if count < len(regexB.supersets + regexB.disjoints + regexA.intersects):
						changed += len(regexB.supersets + regexB.disjoints + regexA.intersects) - count

			added_count += changed
			passes += 1

			if assume_transitivity: break

		logfile.write("Do closure: %d seconds, %d changes, %d passes\n" % ((datetime.datetime.now() - start).seconds, added_count, passes))

	def __add_relationship__(self, a, b, relationship, percolate):
		if random.random() < PROBABILITY_OF_FAILURE and relationship != None:
			relationship = None

		if percolate and a in b.disjoints + b.supersets + b.subsets + b.intersects:
			return

		# Replace maybe with a bitvector for 'checked'
		if relationship == None:
			a.maybes.append(b)
			b.maybes.append(a)
		elif relationship == SUPERSET:
			a.subsets.append(b)
			b.supersets.append(a)
			if percolate:
				for regex in a.supersets:
					self.__add_relationship__(regex, b, SUPERSET)
				for regex in b.subsets:
					self.__add_relationship__(regex, a, SUBSET)
		elif relationship == SUBSET:
			a.supersets.append(b)
			b.subsets.append(a)
			if percolate:
				for regex in a.subsets:
					self.__add_relationship__(regex, b, SUBSET)
				for regex in b.supersets:
					self.__add_relationship__(regex, a, SUPERSET)
		elif relationship == DISJOINT:
			a.disjoints.append(b)
			b.disjoints.append(a)
			if percolate:
				for regex in a.subsets:
					self.__add_relationship__(regex, b, DISJOINT)
				for regex in b.subsets:
					self.__add_relationship__(regex, a, DISJOINT)
		elif relationship == INTERSECT:
			a.intersects.append(b)
			b.intersects.append(a)
		elif relationship == EQUAL:
			a.supersets.append(b)
			b.supersets.append(a)
			a.subsets.append(b)
			b.subsets.append(b)
			if percolate:
				for regex in a.supersets:
					self.__add_relationship__(regex, b, SUPERSET)
				for regex in b.subsets:
					self.__add_relationship__(regex, a, SUBSET)
				for regex in a.subsets:
					self.__add_relationship__(regex, b, SUBSET)
				for regex in b.supersets:
					self.__add_relationship__(regex, a, SUPERSET)

	def __cmp__(self, a, b):
		for test in self.tests:
			result = test(a.string, b.string)
			if result:
				return result			

	def check(self):

		start = datetime.datetime.now()

		for regexA in self.regexps:
			# Something cannot be disjoint and be in any other category:		
			for regexB in regexA.disjoints:
				if regexB in (regexA.intersects + regexA.subsets + regexA.supersets):
					logfile.write("Check: %s cannot intersect and be disjoint from %s\n" % (regexB.string, regexA.string))
			# Something cannot be intersecting and be in any other category:
			for regexB in regexA.intersects:
				if regexB in (regexA.disjoints + regexA.subsets + regexA.supersets):
					logfile.write("Check: %s cannot exclusively intersecting and be anything else to %s\n" % (regexB.string, regexA.string))

		logfile.write("Check: completed in %d seconds\n" % (datetime.datetime.now() - start).seconds)

	def dump_contents(self):
		for regex in self.regexps:
			print "-"*79 + "\n" + regex.string + ":"
			print " intersections:\n  " + "\n  ".join([r.string for r in regex.intersects])
			print " subsets:\n  " + "\n  ".join([r.string for r in regex.subsets])
			print " supersets:\n  " + "\n  ".join([r.string for r in regex.supersets])
			print " disjoints:\n  " + "\n  ".join([r.string for r in regex.disjoints])
			print " maybes:\n  " + "\n  ".join([r.string for r in regex.maybes])

################################# MERGE ######################################

def merge(directory_list, subset_graph, ordering_method):

	total = len(directory_list)

	matching = {}
	
	start = datetime.datetime.now()

	order = ordering_method(subset_graph.regexps)

	num_matched = [0]*len(order)
	num_not_matched = [0]*len(order)


	matched = open("matched.log", "w")
	not_matched = open("not_matched.log", "w")

	for file_name in directory_list:
		match_possibilities = [1]*len(subset_graph.regexps)

		matches = []

#		cur_matched = 0
#		cur_not = 0

		for i in order:
#			num_matched[(cur_matched+cur_not)] += cur_matched
#			num_not_matched[(cur_matched+cur_not)] += cur_not

			if not match_possibilities[i]:
#				cur_not += 1
				continue

#			cur_matched += 1


			element = subset_graph.regexps[i]

			match_possibilities[i] = 0

			if(element.compiled.match(file_name)):
				matches.append(element.string)

				for superset in element.supersets:
					try:
						matches.remove(superset.string)
					except:
						pass

				if i < BRUTE_FORCE_INDEX:
					for other in element.disjoints + element.supersets:
						match_possibilities[other.index] = 0
					
			else:
				if i < BRUTE_FORCE_INDEX:
					for other in element.subsets:
						match_possibilities[other.index] = 0

		matching[file_name] = list(matches)


#	for i in range(len(num_matched)):
#		matched.write(str(i) + " " + str(float(num_matched[i])/len(directory_list)) + "\n")
#		not_matched.write(str(i) + " " + str(float(num_not_matched[i])/len(directory_list)) + "\n")
			

	logfile.write("Finding Solution: %d seconds\n" % ((datetime.datetime.now() - start).seconds))



	return matching

# simply return back the regular order
def simple_ordering(regexps):
	return range(len(regexps))

# Sort the keys in order of best 'middle of the graph'
def order_subset_graph(regexps):

	num_regex = len(regexps)
	list_to_sort = []

	if PRINT_RANKS_TO_FILE:
		balanceout = open("balance.dat", "w")

	for i in range(len(regexps)):

#		off_center = (len(regexps[i].disjoints) + len(regexps[i].supersets)) - (len(regexps[i].subsets) + len(regexps[i].intersects)) * (len(regexps) - (len(regexps[i].disjoints) + len(regexps[i].supersets) + len(regexps[i].subsets) + len(regexps[i].intersects)))
		side1 = len(regexps[i].disjoints) + len(regexps[i].supersets)
		side2 = len(regexps[i].subsets)
		off_center = side1 * side2


		list_to_sort.append((off_center, i))

		if PRINT_RANKS_TO_FILE:
			balanceout.write(str(len(regexps[i].disjoints) + len(regexps[i].supersets)) + " " + str(len(regexps[i].subsets)) + " " + str(off_center) + "\n")


	list_to_sort.sort(reverse=True)


	if GIVE_DISTROBUTION_STATS:
		print "rank \t regex \t\t disjoints \t supersets \t subsets \t intersects"

		for rank, index in list_to_sort:
			if len(regexps[index].string) > 18:
				regex_string = regexps[index].string[:10] + "..." +  regexps[index].string[5:]
			else:
				regex_string = regexps[index].string
			print rank, '\t', regex_string, '\t\t', len(regexps[index].disjoints), '\t', \
				len(regexps[index].supersets), '\t', len(regexps[index].subsets), '\t', len(regexps[index].intersects)


	if PRINT_RANKS_TO_FILE:
		ranksout = open("ranks.dat", "w")
		for i in range(len(list_to_sort)):
			ranksout.write(str(i) + " " + str(list_to_sort[i][0])  + " " + regexps[i].string + "\n")
		

	return [ index for dummy, index in list_to_sort ]





####### DRIVER ###############################################################

regexps = open(sys.argv[2])

if(len(sys.argv)>3):
	logfile = open(sys.argv[3], 'w')
else:
	logfile = open("times.log", 'w')

time_start = datetime.datetime.now()

if not SKIP_DIRECTORY: 
	directories, directories_list = get_directories(False)
else:
	directories_list = get_directories_list()


if SHUFFLE_FILES:
	random.shuffle(directories_list)

# Should be in order of most common to least common, also taking into
#  consideration how long each take.
subsets = SubsetGraph([check_prefixes, check_fixed, check_dotstar, \
								check_suffixes, check_equal])

for line in regexps:
	subsets.add_regex(line.split()[0])

if not SKIP_SUBSET: subsets.generate(FULL_PERCOLATION)

if PERFORM_CHECKS: subsets.check()

if DO_CLOSURE: subsets.do_closure(ASSUME_TRANSITIVITY)

if PERFORM_CHECKS: subsets.check()

if GIVE_OUTPUT: subsets.dump_contents()


merge(directories_list, subsets, order_subset_graph)


logfile.write("Complete program execution: %d seconds\n" % (datetime.datetime.now() - program_start).seconds)


