Skip to content

Commit

Permalink
Improve CSV parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mantas-done committed Feb 16, 2024
1 parent 3152cd6 commit 3c2d167
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 34 deletions.
83 changes: 49 additions & 34 deletions src/Code/Converters/CsvConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,51 +54,66 @@ public function canParseFileContent($file_content)
public function fileContentToInternalFormat($file_content, $original_file_content)
{
$data = self::csvToArray(trim($file_content));
$data_string = '';

$is_start_time = (bool) preg_match(self::timeRegex(), $data[1][0]);
$is_end_time = (bool) preg_match(self::timeRegex(), $data[1][1]);
if ($is_end_time && !isset($data[1][2])) {
throw new UserException('No text (CsvConverter)');
}

// format integers to float for txt converter
$has_heading = !(bool) preg_match(self::timeRegex(), $data[0][0]);
$start = 0;
if ($has_heading) {
$start = 1;
$start_time_column = null;
$end_time_column = null;
$text_column = null;
$last_row = end($data);
$column_count = count($last_row);
$checked_column = 0;
foreach ($last_row as $k => $column) {
if (preg_match(self::timeRegex(), $column)) {
$start_time_column = $k;
$checked_column = $k;
break;
}
}
if ($is_start_time && is_numeric($data[1][0])) {
for ($i = $start; $i < count($data); $i++) {
if (!is_numeric($data[$i][0])) {
throw new UserException("Can't parse this timestamp: " . $data[$i][0]);
if ($start_time_column !== null) {
for ($i = $checked_column + 1; $i < $column_count; $i++) {
$column = $last_row[$i];
if (preg_match(self::timeRegex(), $column)) {
$end_time_column = $i;
$checked_column = $i;
break;
}
$data[$i][0] = number_format($data[$i][0], 3, '.', '');
}
}
if ($is_end_time && is_numeric($data[1][1])) {
for ($i = $start; $i < count($data); $i++) {
$data[$i][1] = number_format($data[$i][1], 3, '.', '');
for ($i = $checked_column + 1; $i < $column_count; $i++) {
$column = $last_row[$i];
if (TxtConverter::hasText($column)) {
$text_column = $i;
break;
}
}

foreach ($data as $k => $row) {
$timestamp_found = (bool) preg_match(self::timeRegex(), $row[0]);
if ($k === 0 && $timestamp_found === false) { // heading
continue;
if ($text_column === null) {
throw new UserException('No text (CsvConverter)');
}

$data_string = '';
foreach ($data as $row) {
if ($start_time_column !== null) {
$is_start_time = preg_match(self::timeRegex(), $row[$start_time_column]);
if (!$is_start_time) {
continue; // skip few first rows if label or empty
}
}

// format csv file as a txt file, so TxtConverter would be able to understand it
if ($is_start_time && $is_end_time) {
$data_string .= $row[0] . ' ' . $row[1] . "\n"; // start end
$data_string .= $row[2] . "\n"; // text
} elseif ($is_start_time) {
$data_string .= $row[0] . "\n"; // start
$data_string .= $row[1] . "\n"; // text
} else {
$data_string .= $row[0] . "\n"; // text
if ($start_time_column !== null) {
$start_time = $row[$start_time_column];
if (is_numeric($start_time)) {
$start_time = number_format($start_time, 3, '.', '');
}
$data_string .= "\n" . $start_time;
}
if ($end_time_column !== null) {
$end_time = $row[$end_time_column];
if (is_numeric($end_time)) {
$end_time = number_format($end_time, 3, '.', '');
}
$data_string .= ' ' . $end_time;
}
$data_string .= "\n";
$data_string .= "\n" . $row[$text_column];
}

return (new TxtConverter)->fileContentToInternalFormat($data_string, '');
Expand Down
17 changes: 17 additions & 0 deletions tests/formats/CsvTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -175,4 +175,21 @@ public function testWrongTimestamp()
Subtitles::loadFromString($string)->getInternalFormat();

}

/**
 * CSV input whose first column is empty in every row, and whose first two
 * rows are an all-empty row and a heading row, must still load as two cues.
 *
 * Only a start timecode is present in the input; the expected cues are
 * 6 -> 8 and 8 -> 9.
 */
public function testGapsInFront()
{
    // Heredoc body and closing marker stay at column 0 so the CSV text is passed through verbatim.
    $csv = <<<TEXT
,,
,Timecode,Subtitle
,0:06,"Hello, my name is Cindy Takehara."
,0:08,I was the project lead for this sound workshop.
TEXT;

    $expected = (new Subtitles())
        ->add(6, 8, 'Hello, my name is Cindy Takehara.')
        ->add(8, 9, 'I was the project lead for this sound workshop.')
        ->getInternalFormat();
    $actual = Subtitles::loadFromString($csv)->getInternalFormat();

    $this->assertInternalFormatsEqual($expected, $actual);
}
}

0 comments on commit 3c2d167

Please sign in to comment.