Skip to content

Commit

Permalink
Merge pull request #6 from alexandrosraikos/2.0
Browse files Browse the repository at this point in the history
2.0 Restructuring - This update revises the structure and semantics of the package. An object-oriented design was followed more closely to allow for further extension of the parsing capabilities.
  • Loading branch information
Alexandros Raikos authored Apr 26, 2021
2 parents 4146c5c + 4157ceb commit 5c97651
Show file tree
Hide file tree
Showing 5 changed files with 241 additions and 157 deletions.
152 changes: 65 additions & 87 deletions dextractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
import os
from typing import Set

# Third party deependencies.
# Third party dependencies.
import colorama
from colorama import Fore

# Local dependencies.
from .expressions import known
from .exclusions import ignored_files, ignored_extensions
from .src.parser import SourceFile
from .src.exclusions import ignored_files, ignored_extensions


def analyse(
any_path: str,
Expand Down Expand Up @@ -44,68 +45,9 @@ def analyse(
coverage_counter = 0
ignored_counter = 0

# - 0.2 Initialise colorama
# - 0.2 Initialise colorama.
colorama.init(autoreset=True)

# - 0.3. Single file dependency analysis module.
def find_in(file_path: str) -> Set:
"""
Read the source file and extract imported package names using regular expressions.
"""
# 0. Initialize empty set of discovered dependencies.
found = set()
nonlocal coverage_counter
nonlocal ignored_counter
nonlocal strict
nonlocal verbose

# 1. When file is written in a supported language.
filename, extension = os.path.splitext(file_path)
if extension in known:
# 1.1. When file isn't too large.
# -----
# NOTE: Most source files for most use cases are not
# expected to exceed 5MB in size (editable).
if os.stat(file_path).st_size < max_file_size:
try:
# 1.1.1. Open file for reading.
file = open(file_path, "r")
if verbose:
print("[dextractor]", end=" ")
print(Fore.CYAN + "INFORMATION:", end=" ")
print(f"Reading {os.path.basename(file.name)}")

# 1.1.2. Match regex and obtain named capture group.
if strict:
query = known[extension + "-strict"]
else:
query = known[extension]
matches = query.findall(file.read())
found.update(matches)

if not found and verbose:
print("[dextractor]", end=" ")
print(Fore.CYAN + "INFORMATION:", end=" ")
print("This file doesn't include any dependencies.")
# 1.1.3. Close file for memory optimisation.
file.close()
except IOError:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(
f"There was an IO error when trying to access the file '{filename}'."
)
else:
raise MemoryError()
elif os.path.basename(filename) in ignored_files or extension in ignored_extensions:
ignored_counter += 1
raise TypeError()
else:
raise NotImplementedError()
# 2. Increment directory coverage counter and return list of found dependencies.
coverage_counter += 1
return found

# 0. Initialise empty dependencies array.
dependencies = set()

Expand All @@ -118,12 +60,27 @@ def find_in(file_path: str) -> Set:
# 1.1.1. Traverse all available files.
for file in files:
try:
dependencies.update(find_in(os.path.join(root, file)))
# 1.1.2. Check for supported language and size.
if os.stat(os.path.join(root, file)).st_size < max_file_size:
if (os.path.splitext(file)[0] not in ignored_files) and (
os.path.splitext(file)[1] not in ignored_extensions
):
# 1.1.3. Extract dependencies.
source_file = SourceFile(os.path.join(root, file))
dependencies.update(
source_file.dependencies(verbose, strict)
)
coverage_counter += 1
else:
ignored_counter += 1
raise TypeError
else:
raise MemoryError
except TypeError:
if verbose:
print("[dextractor]", end=" ")
print(Fore.YELLOW + "NOTICE:", end=" ")
print(f"The file '{file}' is not a source file.")
print(f"The file '{file}' does not contain source code.")
except NotImplementedError:
if verbose:
print("[dextractor]", end=" ")
Expand All @@ -137,12 +94,10 @@ def find_in(file_path: str) -> Set:
except IOError:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(
f"The file '{file}' could not be accessed."
)
# 1.1.2 Update total file count.
print(f"The file '{file}' could not be accessed.")
# 1.1.4 Update total file count.
total_file_count += len(files)
# 1.1.3. Extract statistics.
# 1.1.5. Extract statistics.
if len(files) > 0 and coverage_counter > 0:
print("[dextractor]", end=" ")
print(Fore.GREEN + "SUCCESS:")
Expand All @@ -166,27 +121,50 @@ def find_in(file_path: str) -> Set:
# running the script for a single file.
filename, extension = os.path.splitext(any_path)
try:
dependencies = dependencies.union(find_in(any_path))
source_file = SourceFile(any_path)
dependencies.update(source_file.dependencies(verbose, strict))
# 1.1.2. Check for supported language and size.
if os.stat(any_path).st_size < max_file_size:
if (os.path.splitext(source_file)[0] not in ignored_files) and (
os.path.splitext(source_file)[1] not in ignored_extensions
):
# 1.1.3. Extract dependencies.
source_file = SourceFile(any_path)
dependencies.update(source_file.dependencies(verbose, strict))
coverage_counter += 1
else:
ignored_counter += 1
raise TypeError
else:
raise MemoryError
except TypeError:
if verbose:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(f"The file '{os.path.basename(filename)}{extension}' is not a source file.")
if verbose:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(
f"The file '{os.path.basename(filename)}{extension}' is not a source file."
)
except NotImplementedError:
if verbose:
print("[dextractor]", end=" ")
print(Fore.YELLOW + "NOTICE:", end=" ")
print(f"The file '{os.path.basename(filename)}{extension}' is not yet supported by this module.")
if verbose:
print("[dextractor]", end=" ")
print(Fore.YELLOW + "NOTICE:", end=" ")
print(
f"The file '{os.path.basename(filename)}{extension}' is not yet supported by this module."
)
except MemoryError:
if verbose:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(f"The file '{os.path.basename(filename)}{extension}' is too large.")
if verbose:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(
f"The file '{os.path.basename(filename)}{extension}' is too large."
)
except IOError:
if verbose:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(f"The file '{os.path.basename(filename)}{extension}' could not be accessed.")
if verbose:
print("[dextractor]", end=" ")
print(Fore.RED + "ERROR:", end=" ")
print(
f"The file '{os.path.basename(filename)}{extension}' could not be accessed."
)
else:
raise Exception(
"This is not a file or a directory. It might be a special file (e.g. socket, FIFO, device file), which is unsupported by this package. "
Expand Down
69 changes: 0 additions & 69 deletions dextractor/expressions.py

This file was deleted.

2 changes: 1 addition & 1 deletion dextractor/exclusions.py → dextractor/src/exclusions.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@
".npy",
".npz",
".ini",
".inc"
".inc",
]
90 changes: 90 additions & 0 deletions dextractor/src/languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# ---------
# This source file is part of the Dependency Extractor python open source package.
# Copyright (c) 2021, Alexandros Raikos tou Konstantinou.
#
# Licensed under the MIT License.
# ---------
#
# Define supported languages with their corresponding compiled import regex.
# -----
# NOTE: Strict suffix queries exclude local and relative imports.
# NOTE: These expressions were formulated with the help of https://regex101.com.

import re
from typing import List


class ProgrammingLanguage:
    """
    Describe a supported programming language and its parsing expressions.

    :param name: Display name of the language (e.g. "C++").
    :param extensions: File extensions associated with the language, written
        with the leading dot (e.g. ".cpp").
    :param expressions: Mapping of expression category to its variants. In this
        module every entry has the shape
        ``{"dependencies": {"regular": <compiled pattern>, "strict": <compiled pattern>}}``,
        where the "strict" variant is optional.
    """

    def __init__(self, name: str, extensions: List[str], expressions: dict):
        # Stored as given; no copies are made and no validation is performed.
        self.name: str = name
        self.extensions: List[str] = extensions
        self.expressions: dict = expressions

    def __repr__(self) -> str:
        """Return a short debugging representation (expressions are omitted)."""
        return (
            f"{type(self).__name__}"
            f"(name={self.name!r}, extensions={self.extensions!r})"
        )


# Registry of every language the package can parse. Each entry pairs the
# language's file extensions with its compiled import-matching patterns; the
# named group `dependency` captures the imported name.
supported_languages = [
    # NOTE(review): in the "strict" C/C++ patterns, `[^.hpp]` and `[^.h]` are
    # negated character classes (each consumes exactly one character), not
    # suffix exclusions - confirm this matches the intended filtering.
    ProgrammingLanguage(
        name="C++",
        extensions=[".cpp", ".hpp"],
        expressions={
            "dependencies": {
                "regular": re.compile(
                    r"#include [<\"](?P<dependency>[a-zA-Z0-9!@#$%^&*()_+\-=\[\]{};':\"\\|,.<>\/?]+)[\">]"
                ),
                "strict": re.compile(
                    r"#include [<\"](?P<dependency>[^.][a-zA-Z0-9!@#$%^&*()_+\-=\[\]{};':\"\\|,.<>\/?]+[^.hpp][^.h])[\">]"
                ),
            },
        },
    ),
    # C shares the exact same patterns as C++.
    ProgrammingLanguage(
        name="C",
        extensions=[".c", ".h"],
        expressions={
            "dependencies": {
                "regular": re.compile(
                    r"#include [<\"](?P<dependency>[a-zA-Z0-9!@#$%^&*()_+\-=\[\]{};':\"\\|,.<>\/?]+)[\">]"
                ),
                "strict": re.compile(
                    r"#include [<\"](?P<dependency>[^.][a-zA-Z0-9!@#$%^&*()_+\-=\[\]{};':\"\\|,.<>\/?]+[^.hpp][^.h])[\">]"
                ),
            },
        },
    ),
    # TODO: #1 Needs improvement (it only reads last dependency in list + strict mode.)
    # Only a "regular" pattern is defined for Go; there is no "strict" key.
    ProgrammingLanguage(
        name="Go",
        extensions=[".go"],
        expressions={
            "dependencies": {
                "regular": re.compile(
                    r"import \(\n(?:\t\"(?P<dependency>[a-zA-Z0-9!@#$%^&*()_+\-\[\]{};':\"\\.\/?]+)\"[\n]+)+\)"
                ),
            },
        },
    ),
    # Only a "regular" pattern is defined for Java; there is no "strict" key.
    ProgrammingLanguage(
        name="Java",
        extensions=[".java"],
        expressions={
            "dependencies": {
                "regular": re.compile(
                    r"import (?P<dependency>[a-zA-Z0-9!@#$%^&*_+\-\[\]{};':\"\\.\/?]+);"
                ),
            },
        },
    ),
    # NOTE(review): these patterns are compiled without re.MULTILINE, so `^`
    # anchors only at the start of the searched string - presumably they are
    # applied line by line; verify against the parser.
    ProgrammingLanguage(
        name="Python",
        extensions=[".py", ".pyi"],
        expressions={
            "dependencies": {
                "regular": re.compile(
                    r"^(?:[ ]|)+(?:import|from) (?P<dependency>[^_][a-zA-Z0-9!@#$%^&*()_+\-\[\]{}.;':\"\\\/?]+)"
                ),
                "strict": re.compile(
                    r"^(?:[ ]|)+(?:import|from) (?P<dependency>[^_.][a-zA-Z0-9!@#$%^&*()_+\-\[\]{};':\"\\\/?]+)"
                ),
            },
        },
    ),
]
Loading

0 comments on commit 5c97651

Please sign in to comment.