Skip to content
Snippets Groups Projects
Unverified Commit 4bd1ca70 authored by Andrey Nureev's avatar Andrey Nureev Committed by GitHub
Browse files

Merge pull request #1 from Urdeney/detection_methods

Detection methods
parents 4ed2d29b 327a6246
No related merge requests found
Showing
with 787 additions and 0 deletions
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.vscode
# files to process
/packages
\ No newline at end of file
"""
ASGI config for Diploma project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Use the project's settings module unless DJANGO_SETTINGS_MODULE is already set in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Diploma.settings')
# Module-level ASGI callable, created once when this module is imported.
application = get_asgi_application()
"""
Django settings for Diploma project.
Generated by 'django-admin startproject' using Django 4.2.1.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path
import os

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# The key is taken from DJANGO_SECRET_KEY when that variable is set; the
# committed value below is only a fallback so local development keeps working.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    'django-insecure-!tw@q!z46i1z+!-s*&p1!6un%@+7)1#9kp*(1tx5r3m_@3*794',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Defaults to True (previous behaviour); set DJANGO_DEBUG to 0/false/no/off to disable.
DEBUG = os.environ.get('DJANGO_DEBUG', 'True').strip().lower() in ('1', 'true', 'yes', 'on')

# Comma-separated host names from DJANGO_ALLOWED_HOSTS; empty (previous behaviour) when unset.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
    if host.strip()
]
# Application definition
# Django contrib apps, the project's own 'clonus' app, and the third-party
# form-rendering apps (crispy_forms, crispy_bootstrap5, bootstrap5).
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'clonus',
    'crispy_forms',
    'crispy_bootstrap5',
    'bootstrap5',
]
# Request/response middleware, applied in the order listed.
MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
# Module that holds the project's URL patterns.
ROOT_URLCONF = 'Diploma.urls'
# Django template engine; templates are looked up inside each app (APP_DIRS),
# no project-wide template directories are configured (DIRS is empty).
TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]
# Dotted path to the WSGI callable.
WSGI_APPLICATION = 'Diploma.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
# Single SQLite database stored as db.sqlite3 in BASE_DIR.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}
# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]
# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/
# Russian locale, Moscow time zone.
LANGUAGE_CODE = 'ru-ru'
TIME_ZONE = 'Europe/Moscow'
USE_I18N = True
# NOTE(review): USE_L10N appears to be deprecated in recent Django releases — confirm and drop if so.
USE_L10N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/
STATIC_URL = 'static/'
# Extra directory searched for static files: <BASE_DIR>/static.
STATICFILES_DIRS = [
    BASE_DIR / 'static'
]
# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# crispy-forms: allow and use the "bootstrap5" template pack.
CRISPY_ALLOWED_TEMPLATE_PACKS = "bootstrap5"
CRISPY_TEMPLATE_PACK = "bootstrap5"
"""
URL configuration for Diploma project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.conf import settings
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import path
from clonus import views
# Route table: the admin site, the clonus views, and whatever patterns
# static() produces for STATIC_URL.
# The <str:h> segment of the summary routes is passed to the view as keyword argument ``h``.
# NOTE(review): settings.STATIC_ROOT is not assigned in Diploma.settings as far as
# this file's context shows — confirm the document_root passed to static() is intended.
urlpatterns = [
    path("admin/", admin.site.urls),
    path("", views.index, name="index"),
    path("compare/", views.compare, name="compare"),
    path("compare_many/", views.compare_many, name="compare_many"),
    path("summary/<str:h>", views.summary, name="summary"),
    path("summary_many/<str:h>", views.summary_many, name="summary_many"),
    path("list/", views.list, name="list")
] + static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
"""
WSGI config for Diploma project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Use the project's settings module unless DJANGO_SETTINGS_MODULE is already set in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Diploma.settings')
# Module-level WSGI callable, created once when this module is imported.
application = get_wsgi_application()
from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class ClonusConfig(AppConfig):
    """Application configuration for the ``clonus`` app."""

    # Primary keys of models without an explicit one use BigAutoField.
    default_auto_field = 'django.db.models.BigAutoField'
    # Python path of the application this config belongs to.
    name = 'clonus'
from django import forms
from crispy_forms.helper import FormHelper
from crispy_forms.layout import Fieldset, Layout, Submit
class FileToFileForm(forms.Form):
    """Form that takes two uploaded files plus K-gram and window sizes.

    Fields:
        file1, file2: the two uploaded files.
        gram_size: K-gram size, 1..256, initial value 8.
        window_size: window size, 1..32, initial value 3.
    """

    file1 = forms.FileField(label="Файл 1")
    file2 = forms.FileField(label="Файл 2")
    # Lower bound is 1, not 0: a K-gram or window of size 0 carries no
    # information, and an empty window has no minimum to select.
    gram_size = forms.IntegerField(
        label="Размер K-граммы", initial=8, min_value=1, max_value=256
    )
    window_size = forms.IntegerField(
        label="Размер окна", initial=3, min_value=1, max_value=32
    )

    def __init__(self, *args, **kwargs) -> None:
        """Build the form and attach a crispy-forms helper with the field layout."""
        super().__init__(*args, **kwargs)
        self.helper = FormHelper()
        # Fields are rendered in this order, followed by the submit button.
        self.helper.layout = Layout(
            "file1",
            "file2",
            "gram_size",
            "window_size",
            Submit("submit", "Обработать", css_class="btn btn-warning btn-lg btn-m"),
        )
class MFInput(forms.ClearableFileInput):
    """File input widget with multiple selection switched on."""

    allow_multiple_selected = True
class MFField(forms.FileField):
    """File field that cleans either a single upload or a list/tuple of uploads."""

    def __init__(self, *args, **kwargs):
        """Create the field, using an ``MFInput`` widget unless the caller passes one."""
        if "widget" not in kwargs:
            kwargs["widget"] = MFInput()
        super().__init__(*args, **kwargs)

    def clean(self, data, initial=None):
        """Run the parent ``clean`` on ``data``.

        Returns a list of cleaned values when ``data`` is a list or tuple,
        otherwise the single cleaned value.
        """
        parent_clean = super().clean
        if not isinstance(data, (list, tuple)):
            return parent_clean(data, initial)
        return [parent_clean(item, initial) for item in data]
class ManyFilesForm(forms.Form):
    """Form that takes several uploaded files plus K-gram and window sizes.

    Fields:
        files: the uploaded files (multi-file field).
        gram_size: K-gram size, 1..256, initial value 8.
        window_size: window size, 1..32, initial value 3.
    """

    files = MFField(label="Файлы")
    # Lower bound is 1, not 0: a K-gram or window of size 0 carries no
    # information, and an empty window has no minimum to select.
    gram_size = forms.IntegerField(
        label="Размер K-граммы", initial=8, min_value=1, max_value=256
    )
    window_size = forms.IntegerField(
        label="Размер окна", initial=3, min_value=1, max_value=32
    )

    def __init__(self, *args, **kwargs):
        """Build the form and attach a crispy-forms helper with the field layout."""
        super().__init__(*args, **kwargs)
        self.helper = FormHelper()
        # Fields are rendered in this order, followed by the submit button.
        self.helper.layout = Layout(
            "files",
            "gram_size",
            "window_size",
            Submit("submit", "Обработать", css_class="btn btn-warning btn-lg btn-m"),
        )
from django.core import management
from django.core.management.base import BaseCommand
from shutil import rmtree
from pathlib import Path
class Command(BaseCommand):
    """Management command that deletes stored packages and resets the ``clonus`` tables."""

    help = 'Clear db, remove packages'

    def handle(self, *args, **options):
        """Remove everything under ``packages`` and re-create the ``clonus`` schema.

        Steps:
            1. Delete every entry of the ``packages`` directory (relative to the
               current working directory). A missing directory is not an error.
            2. Migrate ``clonus`` to ``zero``, then migrate it forward again.

        Any exception is reported on the command's stdout in the error style
        instead of being propagated (best-effort behaviour kept from the original).
        """
        try:
            self.stdout.write(self.style.WARNING('Removing packages...'))
            packages_dir = Path('packages')
            # Without this check a fresh checkout (no packages yet) would raise in
            # iterdir() and the database reset below would never run.
            if packages_dir.is_dir():
                for p in packages_dir.iterdir():
                    self.stdout.write(f'Deleting {p}...')
                    # rmtree only handles real directories; plain files and
                    # symlinks have to be unlinked instead.
                    if p.is_dir() and not p.is_symlink():
                        rmtree(p)
                    else:
                        p.unlink()
            self.stdout.write(self.style.WARNING('Removing database entries...'))
            management.call_command('migrate', 'clonus', 'zero')
            self.stdout.write(self.style.WARNING('Migrating database...'))
            management.call_command('migrate', 'clonus')
            self.stdout.write(self.style.SUCCESS('Done'))
        except Exception as exc:  # pylint: disable=broad-exception-caught
            self.stdout.write(self.style.ERROR(exc))
\ No newline at end of file
class FpMethodResult():
    """Container for the outcome of a fingerprint comparison of two files."""

    clone_parts_1: list  # borrowed parts found in the first file
    clone_parts_2: list  # borrowed parts found in the second file
    clone_pct: float  # percentage of borrowed content

    def __init__(self, cl_pt1, cl_pt2, clp_pct) -> None:
        """Store the borrowed parts of both files and the borrowing measure."""
        self.clone_parts_1, self.clone_parts_2, self.clone_pct = cl_pt1, cl_pt2, clp_pct

    def print(self) -> None:
        """Print the three stored values, one per line, to standard output."""
        for value in (self.clone_parts_1, self.clone_parts_2, self.clone_pct):
            print(value)
import pygments.token
import pygments.lexers
from .method_builder import MethodBuilder
from .fp_method import FpMethodResult
class FingerprintMethodBuilder(MethodBuilder):
    """Fingerprint (winnowing) comparison of two source files.

    Pipeline:
        * ``pre_processing_step`` tokenizes every input file with Pygments.
        * ``processing_step`` turns each token stream into text, splits it into
          hashed K-grams, winnows the hashes into fingerprints, maps the
          fingerprints shared by both files back to source positions and
          computes the Szymkiewicz-Simpson coefficient of the two fingerprint sets.
        * ``post_processing_step`` wraps the three results in ``FpMethodResult``.
    """

    input_files: list  # list of the 2 file names to check
    pre_proc_out: list  # output of pre_processing_step
    proc_out: list  # output of processing_step
    post_proc_out: FpMethodResult  # output of post_processing_step
    output: list  # result of running the method
    gram_size: int  # K-gram size
    hash_param: int  # hash function parameter
    window_size: int  # window size (for the winnowing algorithm)

    def __init__(self, f_list: list, gram_size=8, window_size=3, hash_param=273) -> None:
        """Store the input file names and the algorithm parameters.

        Args:
            f_list: file names to compare (the processing step uses the first two).
            gram_size: length of a K-gram.
            window_size: winnowing window length.
            hash_param: base of the polynomial hash.
        """
        super().__init__()
        self.input_files = f_list
        self.pre_proc_out = []
        self.proc_out = []
        self.post_proc_out = []
        self.gram_size = gram_size
        self.hash_param = hash_param
        self.window_size = window_size

    def pre_processing_step(self):
        """Tokenize every input file and append the token lists to ``pre_proc_out``."""
        for file_name in self.input_files:
            self.pre_proc_out.append(self._tokenize(file_name))

    def processing_step(self):
        """Compute matching source ranges for both files and their similarity.

        Appends to ``proc_out``: merged ranges for file 1, merged ranges for
        file 2, then the Szymkiewicz-Simpson coefficient.

        Raises:
            AttributeError: if the number of tokenized files differs from the
                number of input files.
        """
        if len(self.input_files) != len(self.pre_proc_out):
            raise AttributeError("Fingerprint method prepocessing error")

        gh_list = []  # [K-grams, hashes] per text
        finger_prints = []  # fingerprints per text
        for tokens in self.pre_proc_out:
            text = self._to_text(tokens)
            grams = self._get_k_grams_from_text(text, self.gram_size, self.hash_param)
            hashes = self._get_hashes_from_grams(grams)
            gh_list.append([grams, hashes])
            finger_prints.append(self._winnow(hashes, self.window_size))

        for tokens, (grams, hashes) in zip(self.pre_proc_out, gh_list):
            points = self._get_points_lst(finger_prints, tokens, hashes, grams)
            self.proc_out.append(self._get_merged_points(points))
        self.proc_out.append(self._distance_simpson_lst(finger_prints))

    def post_processing_step(self) -> FpMethodResult:
        """Wrap the first three entries of ``proc_out`` in an ``FpMethodResult``."""
        return FpMethodResult(self.proc_out[0], self.proc_out[1], self.proc_out[2])

    def _to_list(self, arr):
        """Return the token values themselves as a list of strings."""
        return [str(x[0]) for x in arr]

    def _to_text(self, arr):
        """Return the token values themselves joined into one string."""
        return ''.join(str(x[0]) for x in arr)

    def _tokenize(self, filename: str) -> list:
        """Tokenize a file, remembering positions.

        Returns a list of ``(value, source_pos, product_pos)`` tuples where
        ``source_pos`` is the character offset in the original text and
        ``product_pos`` the offset in the normalized text. Plain names, strings,
        integers, floats and function names are replaced by the one-letter
        markers ``N``, ``S``, ``I``, ``D`` and ``F``; text/whitespace and
        comments are dropped; every other token keeps its own text.
        """
        with open(filename, "r") as file:
            text = file.read()
        # Pick the lexer (language) from the file name and content.
        lexer = pygments.lexers.guess_lexer_for_filename(filename, text)
        tokens = list(lexer.get_tokens(text))
        res = []
        tokens_len = len(tokens)
        source_cnt = 0  # position of the element in the source code
        product_cnt = 0  # position of the element in the processed code
        for i, (ttype, value) in enumerate(tokens):
            if ttype == pygments.token.Name and i != tokens_len - 1 and tokens[i + 1][1] != '(':
                res.append(('N', source_cnt, product_cnt))
                product_cnt += 1
            elif ttype in pygments.token.Literal.String:
                res.append(('S', source_cnt, product_cnt))
                product_cnt += 1
            elif ttype in pygments.token.Literal.Number.Integer:
                res.append(('I', source_cnt, product_cnt))
                product_cnt += 1
            elif ttype in pygments.token.Literal.Number.Float:
                res.append(('D', source_cnt, product_cnt))
                product_cnt += 1
            elif ttype in pygments.token.Name.Function:
                res.append(('F', source_cnt, product_cnt))
                product_cnt += 1
            elif ttype in pygments.token.Text or ttype in pygments.token.Comment:
                # Skip the whitespace/text token itself. The previous check looked
                # at the *preceding* token, which dropped e.g. keywords and
                # operators that followed whitespace while keeping the whitespace.
                pass
            else:
                res.append((value, source_cnt, product_cnt))
                product_cnt += len(value)
            # The source offset advances for every token, kept or not.
            source_cnt += len(value)
        return res

    def _get_text_from_file(self, filename):
        """Return the lower-cased text of a file."""
        with open(filename, 'r') as f:
            text = f.read().lower()
        return text

    def _get_text_processing(self, text):
        """Return the text without stop symbols (space and comma)."""
        stop_symbols = [' ', ',']
        return ''.join(j for j in text if j not in stop_symbols)

    def _get_hash_from_gram(self, gram, q):
        """Polynomial hash of a text fragment, base ``q``, modulo 10**9 + 7."""
        h = 0
        mod = 10 ** 9 + 7
        m = 1
        for letter in gram:
            # Character codes are taken relative to 'a'; the value may be
            # negative, Python's % still yields a non-negative result.
            x = ord(letter) - ord('a') + 1
            h = (h + m * x) % mod
            m = (m * q) % mod
        return h

    def _get_k_grams_from_text(self, text, k=25, q=31):
        """Split the text into overlapping K-grams, each carrying its hash and span."""
        return [
            Gram(text[i:i + k], self._get_hash_from_gram(text[i:i + k], q), i, i + k)
            for i in range(0, len(text) - k + 1)
        ]

    def _get_hashes_from_grams(self, grams):
        """Return the hashes of a list of grams, in order."""
        return [gram.hash for gram in grams]

    def _min_index(self, window):
        """Index of the (leftmost) minimum value in the window."""
        min_ = window[0]
        min_i = 0
        for i in range(0, len(window)):
            if window[i] < min_:
                min_ = window[i]
                min_i = i
        return min_i

    def _winnow(self, hashes, w):
        """Winnowing: keep the minimum hash of every window of ``w`` hashes.

        A hash is recorded each time the position of the window minimum changes.
        Returns an empty list when there are fewer than ``w`` hashes.
        """
        if not hashes:
            return []
        prints = []
        # -1 = nothing selected yet. Starting at 0 made it impossible to record
        # a first fingerprint located at index 0.
        prev_min = -1
        # n hashes contain n - w + 1 windows; the former range(n - w) skipped the last one.
        for i in range(len(hashes) - w + 1):
            current_min = i + self._min_index(hashes[i:i + w])
            if current_min != prev_min:
                prints.append(hashes[current_min])
                prev_min = current_min
        return prints

    def _distance_simpson(self, A, B):
        """Szymkiewicz-Simpson coefficient: |A & B| / min(|A|, |B|).

        Returns 0.0 when either collection is empty (nothing can overlap)
        instead of dividing by zero.
        """
        a = set(A)
        b = set(B)
        smaller = min(len(a), len(b))
        if smaller == 0:
            return 0.0
        return len(a & b) / smaller

    def _distance_simpson_lst(self, lst):
        """Szymkiewicz-Simpson coefficient of the first two entries of ``lst``."""
        return self._distance_simpson(lst[0], lst[1])

    def _get_points(self, fp1, fp2, token, hashes, grams):
        """Find the fingerprints shared by ``fp1`` and ``fp2`` as source ranges.

        Args:
            fp1, fp2: fingerprint lists of the two texts.
            token: ``(value, source_pos, product_pos)`` tuples of the text
                whose ranges are wanted.
            hashes: K-gram hashes of that text.
            grams: K-grams of that text (parallel to ``hashes``).

        Returns:
            A list of unique ``[start, end]`` source offsets sorted by start.
            A match is skipped when its start or end does not fall on a token
            boundary.
        """
        shared = set(fp2)
        # Normalized-text position -> source position (last token wins, as before).
        source_by_product = {k[2]: k[1] for k in token}
        points = []
        seen = set()
        for fp in fp1:
            if fp not in shared:
                continue
            gram = grams[hashes.index(fp)]
            startx = source_by_product.get(gram.start_pos)
            endx = source_by_product.get(gram.end_pos)
            if startx is None or endx is None:
                continue
            if (startx, endx) not in seen:
                seen.add((startx, endx))
                points.append([startx, endx])
        points.sort(key=lambda x: x[0])
        return points

    def _get_points_lst(self, lst, token, hashes, grams):
        """Find shared fingerprints, taking both fingerprint lists from ``lst``."""
        return (self._get_points(lst[0], lst[1], token, hashes, grams))

    def _get_merged_points(self, points):
        """Merge overlapping ``[start, end]`` ranges (input sorted by start).

        Returns ``None`` when there are no ranges, i.e. no matching fingerprints.
        """
        if not points:
            return None
        merged_points = [points[0]]
        for point in points[1:]:
            last = merged_points[-1]
            if last[0] <= point[0] <= last[1]:
                # Overlap: extend the last range only if this one reaches further.
                if point[1] > last[1]:
                    merged_points[-1] = [last[0], point[1]]
            else:
                merged_points.append(point)
        return merged_points
class Gram:
    """A K-gram: a text fragment with its hash and its start/end positions."""

    def __init__(self, text, hash_gram, start_pos, end_pos):
        """Store the fragment text, its hash and the positions it spans."""
        self.text, self.hash = text, hash_gram
        self.start_pos, self.end_pos = start_pos, end_pos
from .fp_method_builder import FingerprintMethodBuilder
from .method_configurator import MethodConfigurator
def main():
    """Run the fingerprint method on the two sample Python test files and print the result."""
    filenames = ['./clonus/Tests/Python/test1.py', './clonus/Tests/Python/test2.py']
    fp_builder = FingerprintMethodBuilder(filenames)
    config = MethodConfigurator(fp_builder)
    res = config.make_method()
    res.print()


# Run only when executed as a script, so that merely importing this module
# does not read files and print results as a side effect.
if __name__ == "__main__":
    main()
from abc import ABC, abstractmethod
class MethodBuilder(ABC):
    """Abstract interface of a three-stage method: pre-process, process, post-process."""

    @abstractmethod
    def pre_processing_step(self):
        """Pre-processing stage; must be implemented by subclasses."""

    @abstractmethod
    def processing_step(self):
        """Processing stage; must be implemented by subclasses."""

    @abstractmethod
    def post_processing_step(self):
        """Post-processing stage; must be implemented by subclasses."""
from .method_builder import MethodBuilder
from .fp_method import FpMethodResult
class MethodConfigurator():
    """Runs the three stages of a ``MethodBuilder`` in order."""

    builder: MethodBuilder

    def __init__(self, builder: MethodBuilder) -> None:
        """Remember the builder to drive.

        Raises:
            AttributeError: if ``builder`` is ``None``.
        """
        super().__init__()
        self.builder = builder
        if builder is None:
            raise AttributeError("Builder cannot be empty")

    def change_builder(self, new_builder: MethodBuilder):
        """Replace the builder used by subsequent ``make_method`` calls."""
        self.builder = new_builder

    def make_method(self) -> FpMethodResult:
        """Run pre-processing and processing, then return the post-processing result."""
        active = self.builder
        active.pre_processing_step()
        active.processing_step()
        result = active.post_processing_step()
        return result
# Generated by Django 4.2.1 on 2023-05-23 19:28
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: creates the ``Package`` model."""

    initial = True
    # First migration of the app, so there is nothing to depend on.
    dependencies = [
    ]
    operations = [
        # Package: two nullable file paths, a path, a 32-character hash, the
        # gram/window sizes (defaults 8 and 3), a float coefficient and a
        # creation timestamp set automatically on insert.
        migrations.CreateModel(
            name='Package',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('file1', models.FilePathField(null=True)),
                ('file2', models.FilePathField(null=True)),
                ('path', models.FilePathField()),
                ('hash', models.CharField(max_length=32)),
                ('gram_size', models.PositiveSmallIntegerField(default=8)),
                ('window_size', models.PositiveSmallIntegerField(default=3)),
                ('coeff', models.FloatField(default=0.0)),
                ('date', models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
# Generated by Django 4.2.1 on 2023-06-10 14:53
from django.db import migrations, models
class Migration(migrations.Migration):
    """Second migration: creates the ``MultiPackage`` model."""

    # Must run after the app's initial migration.
    dependencies = [
        ('clonus', '0001_initial'),
    ]
    operations = [
        # MultiPackage: one nullable ``files`` path, a path, a 32-character
        # hash, the gram/window sizes (defaults 8 and 3), a float coefficient
        # and a creation timestamp set automatically on insert.
        migrations.CreateModel(
            name='MultiPackage',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('files', models.FilePathField(null=True)),
                ('path', models.FilePathField()),
                ('hash', models.CharField(max_length=32)),
                ('gram_size', models.PositiveSmallIntegerField(default=8)),
                ('window_size', models.PositiveSmallIntegerField(default=3)),
                ('coeff', models.FloatField(default=0.0)),
                ('date', models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment