From df5559fe07c2db463e71b311672a0825bf43de2e Mon Sep 17 00:00:00 2001 From: kirill kirchak <kirchak@sfedu.ru> Date: Tue, 7 May 2024 20:43:02 +0000 Subject: [PATCH] add API and changed compare logic --- clonus/methods/client.py | 70 +++++++++++++++ clonus/methods/counting_time.py | 11 +++ clonus/methods/fp_method_builder.py | 129 +++++++++++++++++++++------- clonus/models.py | 2 + clonus/serializers.py | 9 ++ clonus/views.py | 79 +++++++++++++++-- requirements.txt | 4 +- 7 files changed, 267 insertions(+), 37 deletions(-) create mode 100644 clonus/methods/client.py create mode 100644 clonus/methods/counting_time.py create mode 100644 clonus/serializers.py diff --git a/clonus/methods/client.py b/clonus/methods/client.py new file mode 100644 index 0000000..c277311 --- /dev/null +++ b/clonus/methods/client.py @@ -0,0 +1,70 @@ +from pathlib import Path +import requests +from datetime import datetime + +def get_files_for_compare(path) -> datetime: + main_path = Path(path) + url = 'http://127.0.0.1:8000/api/v1/SummaryAPIView/' # ? + if main_path.exists(): + all_files = [p for p in main_path.rglob("*.py")] + groups_files = group_by(all_files, main_path) + res_time = datetime(2024, 5, 7) + for g in groups_files: + temp_file = g[0] + for k in range(1, len(g) - 1): + for j in range(k + 1, len(g)): + try: + with open(g[k], 'r') as file1, open(g[j], 'r') as file2, open(temp_file, 'r') as file_template: + start_time = datetime.now() + response = requests.post(url, files={'file1': file1, 'file2': file2, + 'file_template': file_template}, + data={'gram_size': 8, 'window_size': 3}) + cur_time = datetime.now() - start_time + res_time += cur_time + write_to_file_json(response.json(), f'{main_path}\\report.txt', + dict({'absolute_path_to_file1': g[k], + 'absolute_path_to_file2': g[j], + 'absolute_path_to_file_template': temp_file})) + except IOError as e: + print("IOError:" + IOError.strerror) + + return res_time + + +def group_by(files, main_path): + ind = len(main_path.parts) + res = [] + if len(files) != 0: + d = dict() + for file in files: + key = "".join(file.parts[ind + 1:]) + if d.get(key) is None: + d[key] = [file] + else: + d[key].append(file) + + for k in d: + sorted(d[k], key=lambda x: 'templates' in x.parts) + res.append(list(d[k])) + return res + + +def write_to_file_json(response, to_path, absolute_path): + t = [set(v.parts) for k, v in absolute_path.items()] + f, s = t[:-1] + names = set(f - s) + names.update(s - f) + names = list(names) + js_info = [response[k] for k in response][0] + reg = '{\\}+' + with open(to_path, 'a+') as report: + report.write(f'Сравнение работ {names[0]} - {names[1]}\n') + report.write("Отчет:\n") + for k, v in js_info.items(): + if k == 'q1': + var = [f'{k}: {v} \n' for k, v in absolute_path.items()] + report.writelines(var) + report.write(f'{k}: {v}\n') + + report.write( + "--------------------------------------------------------------------------------------------------\n") diff --git a/clonus/methods/counting_time.py b/clonus/methods/counting_time.py new file mode 100644 index 0000000..e372066 --- /dev/null +++ b/clonus/methods/counting_time.py @@ -0,0 +1,11 @@ +from datetime import datetime + +from clonus.methods.client import get_files_for_compare + + +def time_compare(): + start_time = datetime.now() + method_time = get_files_for_compare("C:\\example") + res_time = datetime.now() - start_time + print(f'Время сравнения каталога с файлами: {res_time}') + print(f'Время выполнения метода без учета времени затраченного на запись в файл(чистое время): {str(method_time)[10:]}') diff --git a/clonus/methods/fp_method_builder.py b/clonus/methods/fp_method_builder.py index 318dc2b..ae50367 100644 --- a/clonus/methods/fp_method_builder.py +++ b/clonus/methods/fp_method_builder.py @@ -5,16 +5,16 @@ from .fp_method import FpMethodResult class FingerprintMethodBuilder(MethodBuilder): - input_files: list # список из 2 имен файлов для проверки - pre_proc_out: list # выходные данные pre_processing_step - proc_out: list # выходные данные processing_step - post_proc_out: FpMethodResult # выходные данные processing_step - output: list # результат выполнения метода - gram_size: int # размер K-грамы - hash_param: int # параметр хэш-фукнции - window_size: int # размер окна (для алгоритма winnowing) - - def __init__(self, f_list: list, gram_size=8,window_size=3,hash_param=273) -> None: + input_files: list # список из 2 имен файлов для проверки + pre_proc_out: list # выходные данные pre_processing_step + proc_out: list # выходные данные processing_step + post_proc_out: FpMethodResult # выходные данные processing_step + output: list # результат выполнения метода + gram_size: int # размер K-грамы + hash_param: int # параметр хэш-фукнции + window_size: int # размер окна (для алгоритма winnowing) + + def __init__(self, f_list: list, gram_size=8, window_size=3, hash_param=273) -> None: super().__init__() self.input_files = f_list self.pre_proc_out = [] @@ -30,36 +30,96 @@ class FingerprintMethodBuilder(MethodBuilder): self.pre_proc_out.append(token_lst) def processing_step(self): - tokens_text = [] # список токенов в строковом представлении - gh_list = [] # список К-грам и хэш-значений текстов - finger_prints = [] # fingerprints текстов + tokens_text = [] # список токенов в строковом представлении + gh_list = [] # список К-грам и хэш-значений текстов + finger_prints = [] # fingerprints текстов + prints_template = [] + tmp_i_merged_points = [] + if len(self.input_files) != len(self.pre_proc_out): raise AttributeError("Fingerprint method prepocessing error") for elem in self.pre_proc_out: tokens_text.append(self._to_text(elem)) - - for i in range(len(tokens_text)): - i_grams = self._get_k_grams_from_text( - tokens_text[i], self.gram_size, self.hash_param) - i_hashes = self._get_hashes_from_grams(i_grams) - i_finger_prints = self._winnow(i_hashes, self.window_size) - - gh_list.append([i_grams, i_hashes]) - finger_prints.append(i_finger_prints) - + len_lst_token = len(tokens_text)#1723 1846 1865 1947 for i in range(len(tokens_text)): + if i == len_lst_token - 1: + grams_template = self._get_k_grams_from_text(tokens_text[len_lst_token - 1], self.gram_size, + self.hash_param) + hashes_template = self._get_hashes_from_grams(grams_template) + prints_template = self._winnow(hashes_template, self.window_size) + + else: + i_grams = self._get_k_grams_from_text( + tokens_text[i], self.gram_size, self.hash_param) + i_hashes = self._get_hashes_from_grams(i_grams) + i_finger_prints = self._winnow(i_hashes, self.window_size) + gh_list.append([i_grams, i_hashes]) + finger_prints.append(i_finger_prints) + + # -------------------------------list of copy-pairs---------------------------------------- + + for i in range(len(tokens_text) - 1): i_points = self._get_points_lst( finger_prints, self.pre_proc_out[i], gh_list[i][1], gh_list[i][0]) i_merged_points = self._get_merged_points(i_points) - self.proc_out.append(i_merged_points) + + tmp_i_points = self._get_points_lst( + [finger_prints[i], prints_template], self.pre_proc_out[i], gh_list[i][1], gh_list[i][0]) + tmp_i_merged_points.append(self._get_merged_points(tmp_i_points)) + + + res = self.intersection(i_merged_points, tmp_i_merged_points[i]) + self.proc_out.append(res) self.proc_out.append(self._distance_simpson_lst(finger_prints)) def post_processing_step(self) -> FpMethodResult: return FpMethodResult(self.proc_out[0], self.proc_out[1], self.proc_out[2]) + def intersection(self, lst_i, lst_tmp_i): + new_lst = [] + stop_flag = False + start = 0 + check = False + if len(lst_tmp_i) != 0: + for j in range(len(lst_i)): + for i in range(len(lst_tmp_i)): + if lst_i[j][0] > lst_tmp_i[len(lst_tmp_i)-1][0]: + stop_flag = True + start = j + break + if lst_tmp_i[i][0] > lst_i[j][1]: + check = True + break + if lst_tmp_i[i][0] == lst_i[j][0]: + if lst_tmp_i[i][1] == lst_i[j][1]: + check = False + break + if lst_tmp_i[i][1] < lst_i[j][1]: + new_lst.append([lst_tmp_i[i][1] + 1, lst_i[j][1]]) + check = False + break + elif lst_tmp_i[i][0] > lst_i[j][0]: + if lst_tmp_i[i][1] < lst_i[j][1]: + new_lst.extend([[lst_i[j][0], lst_tmp_i[i][0]-1], [lst_tmp_i[i][1]+1, lst_i[j][1]]]) + check = False + break + if lst_tmp_i[i][1] == lst_i[j][1]: + new_lst.append([lst_i[j][0], lst_tmp_i[i][0]-1]) + check = False + break + elif j <= i: + check = True + if check: + new_lst.append(lst_i[j]) + check = False + if stop_flag: + break + new_lst.extend(lst_i[start:]) + return new_lst + def _to_list(self, arr): """Возвращает сами токены в списке""" return [str(x[0]) for x in arr] @@ -98,7 +158,7 @@ class FingerprintMethodBuilder(MethodBuilder): elif tokens[i][0] in pygments.token.Name.Function: res.append(('F', source_cnt, product_cnt)) product_cnt += 1 - elif i!= 0 and tokens[i-1][0] == pygments.token.Text.Whitespace: + elif i != 0 and tokens[i - 1][0] == pygments.token.Text.Whitespace: pass elif not (tokens[i][0] == pygments.token.Text or tokens[i][0] in pygments.token.Comment): res.append((tokens[i][1], source_cnt, product_cnt)) @@ -133,9 +193,9 @@ class FingerprintMethodBuilder(MethodBuilder): def _get_k_grams_from_text(self, text, k=25, q=31): """Разделить текст на K-граммы""" grams = [] - for i in range(0, len(text)-k+1): - hash_gram = self._get_hash_from_gram(text[i:i+k], q) - gram = Gram(text[i:i+k], hash_gram, i, i+k) + for i in range(0, len(text) - k + 1): + hash_gram = self._get_hash_from_gram(text[i:i + k], q) + gram = Gram(text[i:i + k], hash_gram, i, i + k) grams.append(gram) return grams @@ -161,10 +221,10 @@ class FingerprintMethodBuilder(MethodBuilder): n = len(hashes) prints = [] windows = [] - prev_min = 0 + prev_min = -1 current_min = 0 for i in range(n - w): - window = hashes[i:i+w] + window = hashes[i:i + w] windows.append(window) current_min = i + self._min_index(window) if current_min != prev_min: @@ -236,3 +296,12 @@ class Gram: self.hash = hash_gram self.start_pos = start_pos self.end_pos = end_pos + + +class TemplateCompare: + def __init__(self, path, token, hashes, grams, finger_prints): + self.path = path + self.token = token + self.hashes = hashes + self.grams = grams + self.finger_prints = finger_prints diff --git a/clonus/models.py b/clonus/models.py index c0e0339..5fdd638 100644 --- a/clonus/models.py +++ b/clonus/models.py @@ -12,6 +12,7 @@ class Package(models.Model): id: int file1 = models.FilePathField(null=True) file2 = models.FilePathField(null=True) + file_template = models.FilePathField(null=True, default="пока что вручную вписывать")#attention!!! path = models.FilePathField() hash = models.CharField(max_length=32) gram_size = models.PositiveSmallIntegerField(default=8) @@ -35,6 +36,7 @@ class Package(models.Model): rmtree(self.path) + class MultiPackage(models.Model): id: int files = models.FilePathField(null=True) diff --git a/clonus/serializers.py b/clonus/serializers.py new file mode 100644 index 0000000..58870ed --- /dev/null +++ b/clonus/serializers.py @@ -0,0 +1,9 @@ +from rest_framework import serializers +from glob import glob +from clonus.models import Package + + +class PackageSerialiser(serializers.ModelSerializer): + class Meta: + model = Package + fields = ("gram_size", "window_size") diff --git a/clonus/views.py b/clonus/views.py index 6a1dbf8..083ef07 100644 --- a/clonus/views.py +++ b/clonus/views.py @@ -1,20 +1,82 @@ +from django.forms import model_to_dict from django.shortcuts import render, redirect from django.http import HttpRequest from pathlib import Path from collections import deque from itertools import combinations from more_itertools import ilen +from rest_framework.response import Response from clonus.forms import FileToFileForm, ManyFilesForm +from clonus.methods.client import get_files_for_compare +from clonus.methods.counting_time import time_compare from clonus.models import Package, MultiPackage from clonus.methods.fp_method_builder import FingerprintMethodBuilder from clonus.methods.method_configurator import MethodConfigurator +from rest_framework.views import APIView + +from clonus.serializers import PackageSerialiser + # Create your views here. +class SummaryAPIView(APIView): + def get(self, request): + lst = Package.objects.all() + return Response({'res': PackageSerialiser(lst, many=True).data}) + def post (self,request): + + dict_data = {'gram_size': request.data['gram_size'], 'window_size': request.data['window_size']} + serialiser = PackageSerialiser(data=dict_data) + serialiser.is_valid(raise_exception=True) + + obj_package = serialiser.save() + obj_package.mkdir() + + files = request.FILES + i = 0 + for file in files: + f = files[file] + i += 1 + handle_uploaded_file(f, obj_package.path, i) + + obj_package.file1 = f'{obj_package.path}\\[1]{request.FILES["file1"].name}' + obj_package.file2 = f'{obj_package.path}\\[2]{request.FILES["file2"].name}' + obj_package.file_template = f'{obj_package.path}\\[3]{request.FILES["file_template"].name}' + obj_package.gen_hash() + obj_package.save() + + filenames = [obj_package.file1, obj_package.file2, obj_package.file_template] + fp_builder = FingerprintMethodBuilder(filenames, obj_package.gram_size, obj_package.window_size) + config = MethodConfigurator(fp_builder) + method_res = config.make_method() + obj_package.coeff = method_res.clone_pct + obj_package.processed = True + obj_package.save() + + context = { + "hash": obj_package.hash, + "coeff": obj_package.coeff * 100, + "file1": str(obj_package.file1), + "file2": str(obj_package.file2), + "file_template": str(obj_package.file_template), + "f1": Path(obj_package.file1).name, + "f2": Path(obj_package.file2).name, + "date_compare": obj_package.date, + "q1": method_res.clone_parts_1, + "q2": method_res.clone_parts_2, + + } + return Response({"": context}) + +def handle_uploaded_file(f,path, i): + with open(f'{path}\\[{i}]{f.name}', "wb+") as destination: + for chunk in f.chunks(): + destination.write(chunk) def index(request: HttpRequest): + time_compare() return render(request, "index.html") @@ -78,7 +140,11 @@ def process_file(fil: Path, l: "list[list[int]]"): cur = offsets[0][0] end_of_q = False with open(fil, "r", encoding="utf-8") as f: + # print(res) for line in f: + if pos > l[len(l) - 1][1]: + contents += line + continue contents += line line_count += 1 pos += len(line) @@ -93,7 +159,6 @@ def process_file(fil: Path, l: "list[list[int]]"): except IndexError: end_of_q = True res[-1][-1] = line_count - # print(res) return contents, res @@ -103,7 +168,7 @@ def summary(request: HttpRequest, h: str): except Package.DoesNotExist: return redirect("index") - filenames = [p.file1, p.file2] + filenames = [p.file1, p.file2, p.file_template] fp_builder = FingerprintMethodBuilder(filenames, p.gram_size, p.window_size) config = MethodConfigurator(fp_builder) method_res = config.make_method() @@ -121,13 +186,15 @@ def summary(request: HttpRequest, h: str): "file2": file2, "f1": Path(p.file1).name, "f2": Path(p.file2).name, + # "q1": method_res.clone_parts_1, + # "q2": method_res.clone_parts_2 "q1": ", ".join( - "{start: " + str(i) + ", end: " + str(j) + ", color: 'yellow'}" - for i, j in q1 + "{start: " + str(i) + ", end: " + str(j) + ", color: 'yellow'}" + for i, j in q1 ), "q2": ", ".join( - "{start: " + str(i) + ", end: " + str(j) + ", color: 'yellow'}" - for i, j in q2 + "{start: " + str(i) + ", end: " + str(j) + ", color: 'yellow'}" + for i, j in q2 ), } return render(request, "summary.html", context) diff --git a/requirements.txt b/requirements.txt index 09fb36b..36d11d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ django-bootstrap-v5 crispy-bootstrap5 django-crispy-forms more-itertools -Pygments \ No newline at end of file +djangorestframework +Pygments +requests -- GitLab