Skip to content

Commit

Permalink
Adding XLS UI elements for s3 source
Browse files Browse the repository at this point in the history
  • Loading branch information
psainics committed Dec 11, 2023
1 parent 1779c2c commit 26da464
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 4 deletions.
18 changes: 16 additions & 2 deletions docs/S3-batchsource.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,28 @@ the credentials does not need to be provided.
**Path:** Path to read from. For example, s3a://<bucket>/path/to/input

**Format:** Format of the data to read.
The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', or 'tsv'.
The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', 'tsv' or 'xls'.
If the format is 'blob', every input file will be read into a separate record.
The 'blob' format also requires a schema that contains a field named 'body' of type 'bytes'.
If the format is 'text', the schema must contain a field named 'body' of type 'string'.

**Sample Size:** The maximum number of rows that will get investigated for automatic data type detection.
The default value is 1000.

**Override:** A list of columns and their corresponding data types for which automatic data type detection is
skipped.

**Terminate If Empty Row:** Whether to terminate the file reading if an empty row is encountered.
The default value is false.

**Select Sheet Using:** Select the sheet by name or number. Default is 'Sheet Number'.

**Sheet Value:** The name/number of the sheet to read from. If not specified, the first sheet will be read.
Sheet numbers are 0-based, i.e. the first sheet is 0.

**Delimiter:** Delimiter to use when the format is 'delimited'. This will be ignored for other formats.

**Use First Row as Header:** Whether to use first row as header. Supported formats are 'text', 'csv', 'tsv', 'delimited'.
**Use First Row as Header:** Whether to use first row as header. Supported formats are 'text', 'csv', 'tsv', 'delimited', 'xls'.

**Authentication Method:** Authentication method to access S3. The default value is Access Credentials.
IAM can only be used if the plugin is run in an AWS environment, such as on EMR.
Expand Down
33 changes: 33 additions & 0 deletions src/main/java/io/cdap/plugin/aws/s3/source/S3BatchSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ public static class S3BatchConfig extends AbstractFileSourceConfig {
public static final String NAME_PATH = "path";
private static final String NAME_FILE_SYSTEM_PROPERTIES = "fileSystemProperties";
private static final String NAME_DELIMITER = "delimiter";
private static final String NAME_SHEET = "sheet";
private static final String NAME_SHEET_VALUE = "sheetValue";
private static final String NAME_TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow";

private static final Gson GSON = new Gson();
private static final Type MAP_STRING_STRING_TYPE = new TypeToken<Map<String, String>>() { }.getType();
Expand Down Expand Up @@ -174,6 +177,36 @@ public static class S3BatchConfig extends AbstractFileSourceConfig {
"fail, if credentials are incorrect. The default value is false.")
private Boolean verifyCredentials;

@Macro
@Nullable
@Description("The maximum number of rows that will get investigated for automatic data type detection.")
private Long sampleSize;

@Macro
@Nullable
@Description("A list of columns with the corresponding data types for whom the automatic data type detection gets" +
" skipped.")
private String override;

@Name(NAME_SHEET)
@Macro
@Nullable
@Description("Select the sheet by name or number. Default is 'Sheet Number'.")
private String sheet;

// Name or number of the sheet to read; interpreted together with the 'sheet' selector property.
// The description is user-facing, so the two sentences must be separated by a space.
@Name(NAME_SHEET_VALUE)
@Macro
@Nullable
@Description("The name/number of the sheet to read from. If not specified, the first sheet will be read. " +
"Sheet numbers are 0-based, i.e. the first sheet is 0.")
private String sheetValue;

@Name(NAME_TERMINATE_IF_EMPTY_ROW)
@Macro
@Nullable
@Description("Whether to terminate the pipeline if an empty row is encountered. Default is 'false'.")
private String terminateIfEmptyRow;

private S3BatchConfig(String path, @Nullable S3ConnectorConfig connection, String fileSystemProperties,
Boolean verifyCredentials) {
super();
Expand Down
104 changes: 102 additions & 2 deletions widgets/S3-batchsource.json
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@
"json",
"parquet",
"text",
"tsv"
"tsv",
"xls"
],
"default": "text"
},
Expand All @@ -135,6 +136,36 @@
"plugin-method": "getSchema"
}
},
{
"widget-type": "number",
"label": "Sample Size",
"name": "sampleSize",
"widget-attributes": {
"default": "1000",
"minimum": "1"
}
},
{
"widget-type": "keyvalue-dropdown",
"label": "Override",
"name": "override",
"widget-attributes": {
"key-placeholder": "Field Name",
"value-placeholder": "Data Type",
"dropdownOptions": [
"boolean",
"bytes",
"double",
"float",
"int",
"long",
"string",
"date",
"time",
"timestamp"
]
}
},
{
"widget-type": "textbox",
"label": "Delimiter",
Expand Down Expand Up @@ -174,6 +205,42 @@
"label": "False"
}
}
},
{
"widget-type": "toggle",
"label": "Terminate If Empty Row",
"name": "terminateIfEmptyRow",
"widget-attributes": {
"default": "false",
"on": {
"value": "true",
"label": "True"
},
"off": {
"value": "false",
"label": "False"
}
}
},
{
"widget-type": "select",
"label": "Select Sheet Using",
"name": "sheet",
"widget-attributes": {
"values": [
"Sheet Name",
"Sheet Number"
],
"default": "Sheet Number"
}
},
{
"widget-type": "textbox",
"label": "Sheet Value",
"name": "sheetValue",
"widget-attributes": {
"default": "0"
}
}
]
},
Expand Down Expand Up @@ -674,13 +741,46 @@
{
"name": "skipHeader",
"condition": {
"expression": "format == 'delimited' || format == 'csv' || format == 'tsv'"
"expression": "format == 'delimited' || format == 'csv' || format == 'tsv' || format == 'xls'"
},
"show": [
{
"name": "skipHeader"
}
]
},
{
"name": "sheet",
"condition": {
"expression": "format == 'xls'"
},
"show": [
{
"name": "sheet"
}
]
},
{
"name": "sheetValue",
"condition": {
"expression": "format == 'xls'"
},
"show": [
{
"name": "sheetValue"
}
]
},
{
"name": "terminateIfEmptyRow",
"condition": {
"expression": "format == 'xls'"
},
"show": [
{
"name": "terminateIfEmptyRow"
}
]
}
]
}

0 comments on commit 26da464

Please sign in to comment.