Skip to content

Commit

Permalink
v1.6 commit
Browse files Browse the repository at this point in the history
  • Loading branch information
drojaazu committed Nov 21, 2022
2 parents e5c22ad + 4afbf71 commit 9b737cd
Show file tree
Hide file tree
Showing 40 changed files with 1,109 additions and 625 deletions.
Binary file added .cache/clangd/index/app.hpp.6DDEDC001E61ADFE.idx
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added .cache/clangd/index/jstrings.hpp.4DF5880E965F6154.idx
Binary file not shown.
Binary file added .cache/clangd/index/main.cpp.FDFB40A6425EB388.idx
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added .cache/clangd/index/usage.hpp.CC50E5353C0C2017.idx
Binary file not shown.
38 changes: 32 additions & 6 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -1,22 +1,48 @@
Language: Cpp
BasedOnStyle: LLVM

AlignAfterOpenBracket: DontAlign
AlignEscapedNewlines: DontAlign
AlignOperands: Align
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AlignArrayOfStructures: None
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
BreakBeforeBraces: Custom
BreakConstructorInitializers: AfterColon
AllowShortLoopsOnASingleLine: false
AllowShortIfStatementsOnASingleLine: Never
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BreakBeforeBraces: Allman
Cpp11BracedListStyle: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: AfterColon
BinPackArguments: false
BinPackParameters: false
ColumnLimit: 120
ContinuationIndentWidth: 2
Cpp11BracedListStyle: true
Language: Cpp
MaxEmptyLinesToKeep: 1
IndentCaseLabels: true
PackConstructorInitializers: Never
PointerAlignment: Middle
ReferenceAlignment: Pointer
SortIncludes: true
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: true
SpaceAroundPointerQualifiers: Both
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeParens: Never
SpaceBeforeInheritanceColon: true
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeParens: ControlStatements
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 2
UseTab: Always
AlwaysBreakTemplateDeclarations: Yes
10 changes: 10 additions & 0 deletions .clangd
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
CompileFlags:
  # Treat code as C++, use C++17 standard, enable more warnings.
  Add: [-xc++, -std=c++17, -Wall]
Diagnostics:
  ClangTidy:
    # Enable the performance and modernize check families, minus two
    # modernize checks that are explicitly opted out of.
    Add: [performance*, modernize*]
    Remove: [modernize-use-trailing-return-type, modernize-avoid-c-arrays]
    CheckOptions:
      # clang-tidy spells snake case as "lower_case"; "SnakeCase" is not one
      # of the case styles readability-identifier-naming recognises.
      # NOTE(review): this option is only consulted when the
      # readability-identifier-naming check is enabled, and it is not in the
      # Add list above - confirm it is enabled elsewhere (e.g. .clang-tidy).
      readability-identifier-naming.VariableCase: lower_case

17 changes: 14 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
include(CheckIncludeFiles)
include(GNUInstallDirs)

# define project
cmake_minimum_required(VERSION 3.5)
project(jstrings VERSION 1.5 LANGUAGES CXX)
set(PROJECT_CONTACT "Damian R ([email protected])")

project(jstrings VERSION 1.6 LANGUAGES CXX)

set(PROJECT_CONTACT "Damian R ([email protected])")
set(PROJECT_WEBSITE "https://github.com/drojaazu")
set(PROJECT_COPYRIGHT "©2018 Motoi Productions / Released under MIT License")
set(PROJECT_BRIEF "A tool for finding JIS-based Japanese text in binary data.")

configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/app.hpp.cfg" "${CMAKE_CURRENT_SOURCE_DIR}/src/app.hpp" ESCAPE_QUOTES)

Expand All @@ -27,5 +30,13 @@ add_executable(${PROJECT_NAME} ${SRCFILES})

target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_17)

# Locate libiconv and abort configuration early if it cannot be found.
# NOTE(review): on systems where iconv is provided by the C library itself
# there may be no separate libiconv to find - confirm the supported platforms.
find_library(ICONV_LIB iconv)

if(NOT ICONV_LIB)
  message(FATAL_ERROR "libiconv not found")
endif()

# Link against the library find_library() actually located. Passing the bare
# name "iconv" would discard that result and make the linker search again,
# which fails when the library lives outside the default linker search path.
target_link_libraries(${PROJECT_NAME} ${ICONV_LIB})

install(TARGETS jstrings
RUNTIME DESTINATION bin)
49 changes: 35 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,51 @@ Specify the encoding to use. Use one of the strings listed in parantheses below

Optional; default is Shift-JIS.

-m number
--match-length number
-l integer
--match-length integer

Set minimum number of characters to match as a valid string. Optional; default is 10.
Set minimum number of characters to match as a valid string.

-c number
--cutoff number
Optional; default is 5.

-c integer
--cutoff integer

Limit the output to the specified number of characters for a string. This is useful for "previewing" a file which may have large blocks of junk data that happen to fall within the range of valid code points. Strings that are cut off will be appended with an ellipsis.

Note that the length is in bytes, not characters. As such, due to the variable width nature of UTF-8, there is a chance the final character displayed may be incorrect. STL string functions do not work natively with encodings and the author feels that the work needed to implement this for an optional feature that should only be used for quickly previewing data would be overly complex.

Optional; default is no cutoff.

-m
--multiline

Include newline characters (0x0D or 0x0D0A) as valid. Otherwise, these will count as end of string markers.

Optional; default is disabled.

-r
--raw

Output the data in its original encoding without converting to Unicode.

Optional; default is disabled (will convert output strings to UTF-8).

-s
--skip-jis0201

Skip checking for JIS X 0201 characters. This is an 8-bit katakana-only code space that acts as a supplement to ASCII and was generally only used on older home computers. Skipping it can reduce false positives if you are working with newer data.

Optional; default is disabled (will include JIS X 0201 characters).

Limit the output to the specified number of characters for a string. This is useful for "previewing" a file which may have large blocks of junk data that happen to fall within the range of valid encoding values. Optional; default is no cutoff.

-l
--multiline

Do not break the string on CR/LF characters. Such characters will instead appear as /0D and /0A respectively in the output string.

## Output
Data is output in its original encoding without any conversion. Other tools, such as iconv, can do conversion to something more useful (such as UTF8). For example:

# for Shift-JIS
jstrings file.bin | iconv -f SHIFT-JIS -t UTF-8 -c | less
# for CP932
jstrings file.bin | iconv -f CP932 -t UTF-8 -c | less
# for EUC-JP
jstrings file.bin | iconv -f EUC-JP -t UTF-8 -c | less
Found strings are prefixed with the offset at which they were found in the original data and sent to stdout. Strings are converted to UTF-8 using libiconv. The original encoding can be preserved by using the `--raw` option.

## Building
CMake is used for the build system. From the root directory:
Expand Down
33 changes: 19 additions & 14 deletions inc/enc_cp932.hpp
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/

#ifndef ENC_CP932_H
#define ENC_CP932_H
#ifndef ENC_CP932_HPP
#define ENC_CP932_HPP

#include "enc_shiftjis.hpp"

namespace encodings
namespace motoi
{

class encoding_cp932 : public encoding_shiftjis
/**
* @brief Validator that determines if input data points to a valid MS Code
* Page 932 code point
*
* @details is_valid() returns 0 (as a uint) if the data is not a valid
* sequence; otherwise it returns the number of bytes in the code point sequence
*
*/
class cp932_validator : public shiftjis_validator
{
public:
u8 is_valid(u8 const * data);
~encoding_cp932() {};
cp932_validator()
{
m_iconv_code = "CP932";
}

uint is_valid(byte_t const * data) const override;
~cp932_validator() override = default;
};

} // namespace encodings
} // namespace motoi

#endif
38 changes: 22 additions & 16 deletions inc/enc_eucjp.hpp
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/
#ifndef ENC_EUCJP_H
#define ENC_EUCJP_H
#include "encoding.hpp"
#ifndef ENC_EUCJP_HPP
#define ENC_EUCJP_HPP

#include "jis_validator.hpp"

namespace encodings
namespace motoi
{

class encoding_eucjp : public encoding
/**
* @brief Validator that determines if input data points to a valid EUC-JP
* code point
*
* @details is_valid() returns 0 (as a uint) if the data is not a valid
* sequence; otherwise it returns the number of bytes in the code point sequence
*
*/
class eucjp_validator : public jis_validator
{
private:
constexpr static char const * m_iconvcode = "EUC-JP";

public:
encoding_eucjp() : encoding(3) {};
u8 is_valid(u8 const * data);
~encoding_eucjp() {};
eucjp_validator() :
jis_validator(3, m_iconvcode) {};
uint is_valid(byte_t const * data) const override;
~eucjp_validator() override = default;
};

} // namespace encodings
#endif // ENC_EUC_H
} // namespace motoi
#endif
42 changes: 25 additions & 17 deletions inc/enc_shiftjis.hpp
Original file line number Diff line number Diff line change
@@ -1,24 +1,32 @@
/*!
* \author Damian Rogers ([email protected])
* \version 1.1
* \date 2019.12.01
* \copyright MIT License
*/

#ifndef ENC_SHIFTJIS_H
#define ENC_SHIFTJIS_H
#include "encoding.hpp"
#ifndef ENC_SHIFTJIS_HPP
#define ENC_SHIFTJIS_HPP
#include "jis_validator.hpp"

namespace encodings
namespace motoi
{

class encoding_shiftjis : public encoding
/**
* @brief Validator that determines if input data points to a valid Shift-JIS
* code point
*
* @details is_valid() returns 0 (as a uint) if the data is not a valid
* sequence; otherwise it returns the number of bytes in the code point sequence.
*
* This supports traditional Shift-JIS, which encompasses the JIS X 0201 and
* JIS X 0208 character sets. It does not currently support the JIS X 0213
* extension set.
*/
class shiftjis_validator : public jis_validator
{
private:
constexpr static char const * m_iconvcode = "SHIFT-JIS";

public:
encoding_shiftjis() : encoding(2) {};
u8 is_valid(u8 const * data);
~encoding_shiftjis() {};
shiftjis_validator() :
jis_validator(2, m_iconvcode) {};
uint is_valid(byte_t const * data) const override;
~shiftjis_validator() override = default;
;
};

} // namespace encodings
#endif // ENC_SHIFTJIS_H
} // namespace motoi
#endif
43 changes: 43 additions & 0 deletions inc/enc_validator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#ifndef ENCODING_HPP
#define ENCODING_HPP

#include "types.hpp"

namespace motoi
{
/*!
 * \brief Abstract base for encoding validator classes
 *
 * A concrete validator derives from this class, implements is_valid() and
 * hands its maximum sequence length to the protected constructor. The class
 * cannot be instantiated directly: is_valid() is pure virtual and the only
 * usable constructor is protected.
 */
class encoding_validator
{
protected:
	/*!
	 * \brief Value reported by max_seq_len(); initialized by the constructor.
	 *
	 * NOTE(review): presumably the largest number of bytes a single code point
	 * can occupy in the encoding - confirm against the derived validators.
	 */
	uint m_max_seq_len;

	/*!
	 * \brief Records the maximum sequence length for the derived validator.
	 * \param max_seq_len Value subsequently returned by max_seq_len()
	 *
	 * Declared explicit so that a plain integer is never implicitly converted
	 * into a validator base.
	 */
	explicit encoding_validator(uint max_seq_len) :
			m_max_seq_len {max_seq_len}
	{
	}

public:
	// A validator without a maximum sequence length is not meaningful.
	encoding_validator() = delete;

	// Virtual so that derived validators are destroyed correctly through a
	// pointer or reference to this base.
	virtual ~encoding_validator() = default;

	/*!
	 * \brief Determines if the given bytes are a valid byte sequence for the encoding.
	 * Returns the number of valid bytes if true.
	 * \param data Pointer to the first byte to examine
	 * \return Number of valid bytes, as implemented by the derived class
	 *
	 * NOTE(review): only a pointer is passed, with no length; presumably the
	 * caller must guarantee enough readable bytes behind it - confirm.
	 */
	[[nodiscard]] virtual uint is_valid(byte_t const * data) const = 0;

	/*!
	 * \brief Function-call shorthand that forwards to is_valid().
	 * \param data Pointer to the first byte to examine
	 * \return Whatever is_valid(data) returns
	 */
	[[nodiscard]] uint operator()(byte_t const * data) const
	{
		return is_valid(data);
	}

	/*!
	 * \brief Returns the maximum sequence length supplied at construction.
	 */
	[[nodiscard]] uint max_seq_len() const
	{
		return m_max_seq_len;
	}
};
} // namespace motoi

#endif
40 changes: 0 additions & 40 deletions inc/encoding.hpp

This file was deleted.

Loading

0 comments on commit 9b737cd

Please sign in to comment.