Commit 52eb703e authored by Andrey

better naming and minor optimization

parent b8619e0b
class Method_Result():
class Fp_Method_Result():
clone_parts_1 : list # fragments borrowed from the first file
clone_parts_2 : list # fragments borrowed from the second file
clone_pct : float # percentage of borrowed code
......
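For orientation, a minimal sketch of the renamed result container as a plain dataclass, based only on the three fields visible in this truncated diff (the real constructor and any other members are elided here):

from dataclasses import dataclass

@dataclass
class Fp_Method_Result_Sketch:   # hypothetical name; the committed class is Fp_Method_Result
    clone_parts_1: list          # borrowed fragments found in the first file
    clone_parts_2: list          # borrowed fragments found in the second file
    clone_pct: float             # share of borrowed code between the two files

# Fp_Method_Result_Sketch([[0, 40]], [[12, 52]], 0.67)   # illustrative values only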
from method_builder import Method_Builder
from fp_method import Method_Result
import pygments.token,pygments.lexers
import hashlib
import pygments.token
import pygments.lexers
from method_builder import Method_Builder
from fp_method import Fp_Method_Result
class Fingerprint_Method_Builder(Method_Builder):
input : list # list of filenames of texts (new preprocessing?)
pre_p : list # pre_processing_step result = token1
p: list # processing_step result
post_p : object # post_processing_step result
output : list # method_result
def __init__(self,input : str) -> None:
class Fingerprint_Method_Builder(Method_Builder):
input_files: list # list of 2 filenames to detect clones
pre_proc_out: list # pre_processing_step output
proc_out: list # processing_step output
post_proc_out: Fp_Method_Result # post_processing_step output
output: list # method_result
gram_size: int # size of K-gram
hash_param: int # hash_function parameter
window_size: int # size of window (for winnowing)
def __init__(self, f_list: list, gram_size=8, hash_param=259, window_size=3) -> None:
super().__init__()
self.input = input # add other fields ?
self.pre_p = []
self.p = []
self.post_p = []
def pre_processing_step(self) -> None:
for st in self.input:
token_lst=self._tokenize(st) # ((token), source_pos, processed_pos)
self.pre_p.append(token_lst)
self.input_files = f_list
self.pre_proc_out = []
self.proc_out = []
self.post_proc_out = []
self.gram_size = gram_size
self.hash_param = hash_param
self.window_size = window_size
def pre_processing_step(self):
for file_name in self.input_files:
token_lst = self._tokenize(file_name)
self.pre_proc_out.append(token_lst)
def processing_step(self):
if len(self.input) != len(self.pre_p):
raise AttributeError("Fingerprint method preprocessing error")
k=8 # move out to the configurator
q=259
w=3
text_tokens = [] # token streams of text 1 and text 2 (formerly token1 and token2)
tokens_text = [] # tokens in string notation
gh_list = [] # list of texts' grams and hashes
finger_prints = [] # texts' fingerprints
for elem in self.pre_p:
text_tokens.append(self._to_text(elem))
grams1 = self._get_k_grams_from_text(text_tokens[0], k, q)
grams2 = self._get_k_grams_from_text(text_tokens[1], k, q)
hashes1 = self._get_hashes_from_grams(grams1)
hashes2 = self._get_hashes_from_grams(grams2)
if len(self.input_files) != len(self.pre_proc_out):
raise AttributeError("Fingerprint method preprocessing error")
for elem in self.pre_proc_out:
tokens_text.append(self._to_text(elem))
fp1 = self._winnow(hashes1, w)
fp2 = self._winnow(hashes2, w)
for i in range(len(tokens_text)):
i_grams = self._get_k_grams_from_text(
tokens_text[i], self.gram_size, self.hash_param)
i_hashes = self._get_hashes_from_grams(i_grams)
i_finger_prints = self._winnow(i_hashes, self.window_size)
points1 = self._get_points(fp1, fp2, self.pre_p[0], hashes1, grams1)
points2 = self._get_points(fp2, fp1, self.pre_p[1], hashes2, grams2)
gh_list.append([i_grams, i_hashes])
finger_prints.append(i_finger_prints)
merged_points1 = self._get_merged_points(points1)
merged_points2 = self._get_merged_points(points2)
#self.p.append([merged_points1, merged_points2, self._distance_simpson(fp1, fp2)])
self.p.append(merged_points1)
self.p.append(merged_points2)
self.p.append(self._distance_simpson(fp1, fp2))
for i in range(len(tokens_text)):
i_points = self._get_points_lst(
finger_prints, self.pre_proc_out[i], gh_list[i][1], gh_list[i][0])
i_merged_points = self._get_merged_points(i_points)
self.proc_out.append(i_merged_points)
# for i in range(0,len(tokens)):
# grams = self._get_k_grams_from_text(tokens[i], k, q)
# hashes = self._get_hashes_from_grams(grams)
# fp = self._winnow(hashes, w)
self.proc_out.append(self._distance_simpson_lst(finger_prints))
# points1 = self._get_points(fp1, fp2, token1, hashes1, grams1)
# points2 = self._get_points(fp1, fp2, token2, hashes2, grams2)
# merged_points1 =self._get_merged_points_extended(points1)
# merged_points2 = self._get_merged_points_extended(points2)
def post_processing_step(self) -> Method_Result:
return Method_Result (self.p[0],self.p[1],self.p[2])
#print(self.p)
def post_processing_step(self) -> Fp_Method_Result:
return Fp_Method_Result(self.proc_out[0], self.proc_out[1], self.proc_out[2])
#Returns only the tokens in a list
def _to_list(self,arr):
# Returns just the tokens themselves in a list
def _to_list(self, arr):
return [str(x[0]) for x in arr]
#Returns only the tokens as a string
def _to_text(self,arr):
# Returns just the tokens themselves as a string
def _to_text(self, arr):
cleanText = ''.join(str(x[0]) for x in arr)
return cleanText
#tokenization algorithm
def _tokenize(self,st : str) -> None:
file = open(st, "r")
# Tokenization algorithm that tracks token positions
def _tokenize(self, filename: str) -> None:
file = open(filename, "r")
text = file.read()
file.close()
lexer = pygments.lexers.guess_lexer_for_filename(st, text) # to detect the language
lexer = pygments.lexers.guess_lexer_for_filename(
filename, text) # to detect the language
tokens = lexer.get_tokens(text)
tokens = list(tokens)
result = []
lenT = len(tokens)
source_cnt = 0 #element position in the source code
product_cnt = 0 #element position in the processed code
for i in range(lenT):
if tokens[i][0] == pygments.token.Name and not i == lenT - 1 and not tokens[i + 1][1] == '(':
result.append(('N', source_cnt, product_cnt))
res = []
tokens_len = len(tokens)
source_cnt = 0 # element position in the source code
product_cnt = 0 # element position in the processed code
for i in range(tokens_len):
if tokens[i][0] == pygments.token.Name and i != tokens_len - 1 and tokens[i + 1][1] != '(':
res.append(('N', source_cnt, product_cnt))
product_cnt += 1
elif tokens[i][0] in pygments.token.Literal.String:
result.append(('S', source_cnt, product_cnt))
res.append(('S', source_cnt, product_cnt))
product_cnt += 1
elif tokens[i][0] in pygments.token.Name.Function:
result.append(('F', source_cnt, product_cnt))
res.append(('F', source_cnt, product_cnt))
product_cnt += 1
elif tokens[i][0] == pygments.token.Text or tokens[i][0] in pygments.token.Comment:
pass
else:
result.append((tokens[i][1], source_cnt, product_cnt))
elif not (tokens[i][0] == pygments.token.Text or tokens[i][0] in pygments.token.Comment):
res.append((tokens[i][1], source_cnt, product_cnt))
product_cnt += len(tokens[i][1])
source_cnt += len(tokens[i][1])
return result
return res
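# Added note: the branches above normalize the token stream, identifiers that are not
# immediately followed by '(' become 'N', string literals become 'S', function names
# become 'F', whitespace and comments are dropped, and every other token is kept as is.
# Each entry is (token_text_or_tag, position_in_source, position_in_processed_text),
# which lets matches found on the processed text be mapped back to source offsets.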
#returns the text from a file (unused)
def _get_text_from_file(self,filename):
# Returns the text from a file
def _get_text_from_file(self, filename):
with open(filename, 'r') as f:
text = f.read().lower()
return text
#returns the text without stop symbols (unused)
def _get_text_processing(self,text):
# returns the text without stop symbols
def _get_text_processing(self, text):
stop_symbols = [' ', ',']
return ''.join(j for j in text if not j in stop_symbols)
return ''.join(j for j in text if j not in stop_symbols)
#hash function over a text fragment (support different hash functions??)
def _get_hash_from_gram(self,gram, q):
# Hash function over a text fragment, variant 1
def _get_hash_from_gram(self, gram, q):
h = 0
k = 273
mod = 10 ** 9 + 7 # 2**64
mod = 10 ** 9 + 7
m = 1
for letter in gram:
x = ord(letter) - ord('a') + 1
......@@ -135,15 +124,15 @@ class Fingerprint_Method_Builder(Method_Builder):
m = (m * k) % mod
return h
#hash function over a text fragment (support different hash functions??)
def _get_hash_from_gram1(self,gram, q):
# Hash function over a text fragment, variant 2
def _get_hash_from_gram1(self, gram, q):
hashval = hashlib.sha1(gram.encode('utf-8'))
hashval = hashval.hexdigest()[-4:]
hashval = int(hashval, 16)
return hashval
#split the text into K-grams
def _get_k_grams_from_text(self,text, k=25, q=31):
# Split the text into K-grams
def _get_k_grams_from_text(self, text, k=25, q=31):
grams = []
for i in range(0, len(text)-k+1):
hash_gram = self._get_hash_from_gram(text[i:i+k], q)
......@@ -151,25 +140,25 @@ class Fingerprint_Method_Builder(Method_Builder):
grams.append(gram)
return grams
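# Illustrative sketch (not part of the commit): one common way to split a string into
# K-grams and hash each gram with a simple polynomial hash; the exact hash used by the
# partially elided _get_hash_from_gram above may differ, and this helper name is made up.
def _k_grams_sketch(text, k=8, q=259, mod=10 ** 9 + 7):
    grams = []
    for i in range(len(text) - k + 1):
        gram = text[i:i + k]
        h = 0
        for ch in gram:
            h = (h * q + ord(ch)) % mod    # plain polynomial hash of the gram
        grams.append((gram, h, i, i + k))  # (text, hash, start_pos, end_pos)
    return grams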
#take the hashes from a list of grams
def _get_hashes_from_grams(self,grams):
# Take the hashes from a list of grams
def _get_hashes_from_grams(self, grams):
hashes = []
for gram in grams:
hashes.append(gram.hash)
return hashes
#index of the minimum value in a window (for winnowing)
def _min_index(self,window):
# Index of the minimum value in a window
def _min_index(self, window):
min_ = window[0]
min_i = 0
for i in range(0,len(window)):
for i in range(0, len(window)):
if window[i] < min_:
min_ = window[i]
min_i = i
return min_i
#fingerprint winnowing method (step 2)
def _winnow(self,hashes, w):
# Fingerprint winnowing method
def _winnow(self, hashes, w):
n = len(hashes)
prints = []
windows = []
......@@ -184,14 +173,20 @@ class Fingerprint_Method_Builder(Method_Builder):
prev_min = current_min
return prints
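# Illustrative sketch (not part of the commit): classic winnowing, slide a window of w
# consecutive hashes, keep the minimum of each window, and record it only when the chosen
# position changes; the elided body above follows the same scheme via _min_index.
def _winnow_sketch(hashes, w=3):
    prints = []
    prev_min_pos = -1
    for start in range(len(hashes) - w + 1):
        window = hashes[start:start + w]
        min_pos = start + min(range(w), key=lambda j: window[j])
        if min_pos != prev_min_pos:
            prints.append(hashes[min_pos])
            prev_min_pos = min_pos
    return prints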
#Szymkiewicz-Simpson measure
def _distance_simpson(self,A, B):
# Szymkiewicz-Simpson measure
def _distance_simpson(self, A, B):
a = set(A)
b = set(B)
return len(a.intersection(b)) / min(len(a), len(b))
#find matching fingerprints
def _get_points(self,fp1, fp2, token, hashes, grams):
# Szymkiewicz-Simpson measure (list version)
def _distance_simpson_lst(self, lst):
a = set(lst[0])
b = set(lst[1])
return len(a.intersection(b)) / min(len(a), len(b))
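# Worked example of the Szymkiewicz-Simpson overlap used above: for fingerprint sets
# A = {1, 2, 3, 4} and B = {3, 4, 5} the intersection is {3, 4} and the smaller set has
# 3 elements, so the similarity is 2 / 3 ≈ 0.67 (illustrative numbers, not real hashes).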
# Find matching fingerprints
def _get_points(self, fp1, fp2, token, hashes, grams):
points = []
for i in fp1:
for j in fp2:
......@@ -203,7 +198,7 @@ class Fingerprint_Method_Builder(Method_Builder):
newEnd = grams[match].end_pos
for k in token:
if k[2] == newStart:
startx = k[1]
flag = True
if k[2] == newEnd:
......@@ -211,11 +206,15 @@ class Fingerprint_Method_Builder(Method_Builder):
if flag and endx is not None:
if [startx, endx] not in points:
points.append([startx, endx])
points.sort(key = lambda x: x[0])
points.sort(key=lambda x: x[0])
return points
#merge matching fingerprints
def _get_merged_points(self,points):
# Find matching fingerprints from a list
def _get_points_lst(self, lst, token, hashes, grams):
return (self._get_points(lst[0], lst[1], token, hashes, grams))
# Merge matching fingerprints
def _get_merged_points(self, points):
mergedPoints = []
mergedPoints.append(points[0])
for i in range(1, len(points)):
......@@ -227,7 +226,6 @@ class Fingerprint_Method_Builder(Method_Builder):
else:
mergedPoints.append(points[i])
return mergedPoints
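# Illustrative sketch (not part of the commit): merging sorted [start, end] match ranges
# into maximal non-overlapping intervals, which is what the partially elided method above
# does when assembling the reported clone parts.
def _merge_points_sketch(points):
    merged = [list(points[0])]
    for start, end in points[1:]:
        if start <= merged[-1][1]:                 # overlaps or touches the previous range
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged
# _merge_points_sketch([[0, 10], [5, 20], [30, 40]]) -> [[0, 20], [30, 40]]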
class Gram:
......@@ -235,4 +233,4 @@ class Gram:
self.text = text
self.hash = hash_gram
self.start_pos = start_pos
self.end_pos = end_pos
\ No newline at end of file
self.end_pos = end_pos
from fp_method_builder import *
from method_configurator import *
from fp_method import Method_Result
from fp_method_builder import Fingerprint_Method_Builder
from method_configurator import Method_Configurator
def main():
filenames = ['./Tests/Python/test1.py','./Tests/Python/test2.py']
......
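A minimal usage sketch under assumptions: the diff elides how Method_Configurator drives the builder, so this calls the builder steps directly; the file paths and constructor parameters follow the code shown above, and the result field names assume Fp_Method_Result exposes the fields listed in its truncated definition.

from fp_method_builder import Fingerprint_Method_Builder

builder = Fingerprint_Method_Builder(
    ['./Tests/Python/test1.py', './Tests/Python/test2.py'],
    gram_size=8, hash_param=259, window_size=3)
builder.pre_processing_step()
builder.processing_step()
result = builder.post_processing_step()
print(result.clone_pct)                              # assumed field names, see note above
print(result.clone_parts_1, result.clone_parts_2)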
from abc import ABC, abstractmethod
class Method_Builder(ABC):
#input : str # filename or text
# def __init__(self, input:str) -> None:
# super().__init__()
# self.input = input
@abstractmethod
def pre_processing_step(self):
......
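The base class is truncated here; a sketch of what it plausibly declares, assuming it mirrors the three steps that Fingerprint_Method_Builder implements (only pre_processing_step is visible in this diff, the other two abstract methods are assumptions):

from abc import ABC, abstractmethod

class Method_Builder_Sketch(ABC):      # hypothetical stand-in; the real class is Method_Builder
    @abstractmethod
    def pre_processing_step(self):
        ...

    @abstractmethod
    def processing_step(self):         # assumed, inferred from the subclass above
        ...

    @abstractmethod
    def post_processing_step(self):    # assumed, inferred from the subclass above
        ...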