diff --git a/frontend/components/Datasets.tsx b/frontend/components/Datasets.tsx
index 6ec67c9..89fce4f 100644
--- a/frontend/components/Datasets.tsx
+++ b/frontend/components/Datasets.tsx
@@ -15,6 +15,7 @@ import {
SkeletonCircle,
SkeletonText,
Link,
+ Image as ChakraImage,
} from "@chakra-ui/react";
import Image from "next/image";
import axios from "axios";
@@ -127,19 +128,44 @@ export default function Datasets() {
>
-
+
+              Early in our journey, we recognized that advancing Indian language
+              technology requires large-scale datasets. Thus, building and
+ collecting extensive datasets across multiple verticals has become a
+ critical endeavor at AI4Bharat. Thanks to generous grants from
+ MeitY, we are spearheading pioneering efforts in data collection as
+ part of the Data Management Unit of Bhashini. Our nationwide
+ initiative aims to gather 15,000 hours of transcribed data from over
+ 400 districts, encompassing all 22 scheduled languages of India. In
+ parallel, our in-house team of over 100 translators is diligently
+ creating a parallel corpus with 2.2 million translation pairs across
+ 22 languages. To produce studio-quality data for expressive TTS
+ systems, we have established recording studios in our lab, where
+ professional voice artists contribute their expertise. Additionally,
+ our annotators are meticulously labeling pages for Document Layout
+ Parsing, accommodating the diverse scripts of India. To accelerate
+ the development of Indic Large Language Models (LLMs), we are
+ focused on building pipelines for curating and synthetically
+ generating pre-training data, collecting contextually grounded
+ prompts, and creating evaluation datasets that reflect India’s rich
+ linguistic tapestry. Collecting and annotating data at this scale
+ demands standardization of processes and tools. To meet this
+ challenge, AI4Bharat has invested in developing various open-source
+ data collection and annotation tools, aiming to enhance these
+ efforts not only within India but also in multilingual regions
+ across the globe.
+
{isLoading ? (
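
A note on the Datasets.tsx import change above: the Chakra import is aliased because next/image already owns the default Image name in this file. A minimal illustrative sketch of the pattern, assuming a hypothetical component and asset paths (not taken from the actual Datasets.tsx):

// Illustrative sketch only: component name and asset paths are assumptions.
// The alias lets Chakra's Image coexist with the default export of next/image.
import { Image as ChakraImage } from "@chakra-ui/react";
import Image from "next/image";

export function DatasetsBanner() {
  return (
    <>
      {/* Chakra's Image: styled via Chakra style props, no Next.js optimization */}
      <ChakraImage src="/datasets/banner.png" alt="Datasets banner" borderRadius="md" />
      {/* next/image: statically optimized, needs explicit dimensions */}
      <Image src="/ai4bharat-logo.png" alt="AI4Bharat" width={120} height={40} />
    </>
  );
}
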
diff --git a/frontend/components/Dynamic/Area.tsx b/frontend/components/Dynamic/Area.tsx
index b673f19..f52ad75 100644
--- a/frontend/components/Dynamic/Area.tsx
+++ b/frontend/components/Dynamic/Area.tsx
@@ -23,7 +23,7 @@ const areaInfo: { [key: string]: { title: string; description: string } } = {
nmt: {
title: "Machine Translation",
description:
- "AI4Bharat is a pioneering initiative focused on building open-source AI solutions that address challenges unique to India. One of their significant contributions is in the field of machine translation, where they aim to bridge the linguistic diversity of the country. AI4Bharat has developed state-of-the-art models that facilitate the translation of text between Indian languages, enabling seamless communication across different linguistic communities. Their work includes creating large-scale datasets, fine-tuning models for regional languages, and ensuring these tools are accessible to developers and researchers. This initiative not only promotes inclusivity but also helps preserve the rich linguistic heritage of India by making digital content available in multiple languages.",
+      "Our machine translation models, including IndicTrans2, are built on large-scale datasets mined from the web and carefully curated human translations, cover all 22 scheduled languages of India, and are competitive with commercial systems on multiple benchmarks.",
},
llm: {
title: "Large Language Models",
@@ -40,6 +40,21 @@ const areaInfo: { [key: string]: { title: string; description: string } } = {
models, while ensuring diversity in their generation capabilities, thereby advancing the frontier of
language technology for India’s diverse linguistic landscape.`,
},
+ asr: {
+ title: "Automatic Speech Recognition",
+ description:
+ "Our ASR models, including IndicWav2Vec and IndicWhisper, are trained on rich datasets like Kathbath, Shrutilipi and IndicVoices, covering multiple Indian languages.",
+ },
+ tts: {
+ title: "Speech Synthesis",
+ description:
+ "AI4Bharat’s TTS efforts, exemplified by AI4BTTS, focus on creating natural-sounding synthetic voices for Indian languages using a mix of web-crawled data and carefully curated datasets like Rasa.",
+ },
+ xlit: {
+ title: "Transliteration",
+ description:
+      "AI4Bharat’s transliteration models, like IndicXlit, are optimized for converting text between the scripts of Indian languages and English, leveraging large-scale datasets such as Aksharantar.",
+ },
};
const fetchAreaData = async (slug: string) => {
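
The three new entries above extend the areaInfo map that backs the /areas/<slug> pages, so the keys must match the route slugs (asr, tts, xlit). A minimal, self-contained sketch of that slug-to-metadata contract; the elided descriptions and the fallback for unknown slugs are assumptions, not code from Area.tsx:

// Sketch of the slug -> metadata lookup; the fallback below is hypothetical.
type AreaInfo = { title: string; description: string };

const areaInfo: { [key: string]: AreaInfo } = {
  asr: { title: "Automatic Speech Recognition", description: "..." },
  tts: { title: "Speech Synthesis", description: "..." },
  xlit: { title: "Transliteration", description: "..." },
};

function resolveArea(slug: string): AreaInfo {
  // Unknown slugs fall back to a neutral header instead of breaking the page.
  return areaInfo[slug] ?? { title: slug, description: "" };
}

console.log(resolveArea("asr").title); // "Automatic Speech Recognition"
console.log(resolveArea("ocr").title); // "ocr" (fallback path)
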
diff --git a/frontend/components/Features.tsx b/frontend/components/Features.tsx
index 81ab991..281f12d 100644
--- a/frontend/components/Features.tsx
+++ b/frontend/components/Features.tsx
@@ -103,7 +103,7 @@ export default function Features() {
/>
}
description={
- "AI4Bharat has pioneered the development of multilingual LLMs tailored for Indian languages, such as IndicBERT, IndicBART, and Airavata trained on extensive, diverse datasets like IndicCorpora and Sangraha."
+        "Our machine translation models, including IndicTrans2, are built on large-scale datasets mined from the web and carefully curated human translations, cover all 22 scheduled languages of India, and are competitive with commercial systems on multiple benchmarks."
}
href={`${imagePrefix}/areas/nmt`}
/>
@@ -118,7 +118,7 @@ export default function Features() {
/>
}
description={
- "AI4Bharat has pioneered the development of multilingual LLMs tailored for Indian languages, such as IndicBERT, IndicBART, and Airavata trained on extensive, diverse datasets like IndicCorpora and Sangraha."
+        "AI4Bharat’s transliteration models, like IndicXlit, are optimized for converting text between the scripts of Indian languages and English, leveraging large-scale datasets such as Aksharantar."
}
href={`${imagePrefix}/areas/xlit`}
/>
@@ -133,7 +133,7 @@ export default function Features() {
/>
}
description={
- "AI4Bharat has pioneered the development of multilingual LLMs tailored for Indian languages, such as IndicBERT, IndicBART, and Airavata trained on extensive, diverse datasets like IndicCorpora and Sangraha."
+ "Our ASR models, including IndicWav2Vec and IndicWhisper, are trained on rich datasets like Kathbath, Shrutilipi and IndicVoices, covering multiple Indian languages."
}
href={`${imagePrefix}/areas/asr`}
/>
@@ -148,7 +148,7 @@ export default function Features() {
/>
}
description={
- "AI4Bharat has pioneered the development of multilingual LLMs tailored for Indian languages, such as IndicBERT, IndicBART, and Airavata trained on extensive, diverse datasets like IndicCorpora and Sangraha."
+ "AI4Bharat’s TTS efforts, exemplified by AI4BTTS, focus on creating natural-sounding synthetic voices for Indian languages using a mix of web-crawled data and carefully curated datasets like Rasa."
}
href={`${imagePrefix}/areas/tts`}
/>
diff --git a/frontend/package-lock.json b/frontend/package-lock.json
index 3b7eb75..d2c4818 100644
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -22,6 +22,7 @@
"markdown-to-jsx": "^7.5.0",
"next": "14.2.5",
"react": "^18",
+ "react-audio-voice-recorder": "^2.2.0",
"react-dom": "^18",
"react-icons": "^5.3.0",
"react-markdown": "^9.0.1",
@@ -1751,6 +1752,27 @@
"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
}
},
+ "node_modules/@ffmpeg/ffmpeg": {
+ "version": "0.11.6",
+ "resolved": "https://registry.npmjs.org/@ffmpeg/ffmpeg/-/ffmpeg-0.11.6.tgz",
+ "integrity": "sha512-uN8J8KDjADEavPhNva6tYO9Fj0lWs9z82swF3YXnTxWMBoFLGq3LZ6FLlIldRKEzhOBKnkVfA8UnFJuvGvNxcA==",
+ "license": "MIT",
+ "dependencies": {
+ "is-url": "^1.2.4",
+ "node-fetch": "^2.6.1",
+ "regenerator-runtime": "^0.13.7",
+ "resolve-url": "^0.2.1"
+ },
+ "engines": {
+ "node": ">=12.16.1"
+ }
+ },
+ "node_modules/@ffmpeg/ffmpeg/node_modules/regenerator-runtime": {
+ "version": "0.13.11",
+ "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",
+ "integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==",
+ "license": "MIT"
+ },
"node_modules/@humanwhocodes/config-array": {
"version": "0.11.14",
"resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz",
@@ -5729,6 +5751,12 @@
"url": "https://github.com/sponsors/ljharb"
}
},
+ "node_modules/is-url": {
+ "version": "1.2.4",
+ "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
+ "integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==",
+ "license": "MIT"
+ },
"node_modules/is-weakmap": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz",
@@ -7112,6 +7140,26 @@
}
}
},
+ "node_modules/node-fetch": {
+ "version": "2.7.0",
+ "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
+ "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
+ "license": "MIT",
+ "dependencies": {
+ "whatwg-url": "^5.0.0"
+ },
+ "engines": {
+ "node": "4.x || >=6.0.0"
+ },
+ "peerDependencies": {
+ "encoding": "^0.1.0"
+ },
+ "peerDependenciesMeta": {
+ "encoding": {
+ "optional": true
+ }
+ }
+ },
"node_modules/object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -7587,6 +7635,30 @@
"node": ">=0.10.0"
}
},
+ "node_modules/react-audio-visualize": {
+ "version": "1.1.3",
+ "resolved": "https://registry.npmjs.org/react-audio-visualize/-/react-audio-visualize-1.1.3.tgz",
+ "integrity": "sha512-gvKcmyfJP6XQrzCJeEK4z/lBL/qijqWD6JArZBfaaf5ZxEQPMfQhTipG0LJIUcBYxzVO1rvIhj4ex+/5MRYTFA==",
+ "license": "MIT",
+ "peerDependencies": {
+ "react": ">=16.2.0",
+ "react-dom": ">=16.2.0"
+ }
+ },
+ "node_modules/react-audio-voice-recorder": {
+ "version": "2.2.0",
+ "resolved": "https://registry.npmjs.org/react-audio-voice-recorder/-/react-audio-voice-recorder-2.2.0.tgz",
+ "integrity": "sha512-Hq+143Zs99vJojT/uFvtpxUuiIKoLbMhxhA7qgxe5v8hNXrh5/qTnvYP92hFaE5V+GyoCXlESONa0ufk7t5kHQ==",
+ "license": "MIT",
+ "dependencies": {
+ "@ffmpeg/ffmpeg": "^0.11.6",
+ "react-audio-visualize": "^1.1.3"
+ },
+ "peerDependencies": {
+ "react": ">=16.2.0",
+ "react-dom": ">=16.2.0"
+ }
+ },
"node_modules/react-clientside-effect": {
"version": "1.2.6",
"resolved": "https://registry.npmjs.org/react-clientside-effect/-/react-clientside-effect-1.2.6.tgz",
@@ -8279,6 +8351,13 @@
"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
}
},
+ "node_modules/resolve-url": {
+ "version": "0.2.1",
+ "resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz",
+ "integrity": "sha512-ZuF55hVUQaaczgOIwqWzkEcEidmlD/xl44x1UZnhOXcYuFN2S6+rcxpG+C1N3So0wvNI3DmJICUFfu2SxhBmvg==",
+ "deprecated": "https://github.com/lydell/resolve-url#deprecated",
+ "license": "MIT"
+ },
"node_modules/reusify": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz",
@@ -8934,6 +9013,12 @@
"integrity": "sha512-BiZS+C1OS8g/q2RRbJmy59xpyghNBqrr6k5L/uKBGRsTfxmu3ffiRnd8mlGPUVayg8pvfi5urfnu8TU7DVOkLQ==",
"license": "MIT"
},
+ "node_modules/tr46": {
+ "version": "0.0.3",
+ "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
+ "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
+ "license": "MIT"
+ },
"node_modules/trim-lines": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz",
@@ -9357,6 +9442,22 @@
"url": "https://github.com/sponsors/wooorm"
}
},
+ "node_modules/webidl-conversions": {
+ "version": "3.0.1",
+ "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
+ "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
+ "license": "BSD-2-Clause"
+ },
+ "node_modules/whatwg-url": {
+ "version": "5.0.0",
+ "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
+ "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
+ "license": "MIT",
+ "dependencies": {
+ "tr46": "~0.0.3",
+ "webidl-conversions": "^3.0.0"
+ }
+ },
"node_modules/which": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
diff --git a/frontend/package.json b/frontend/package.json
index 44c98ca..9478f60 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -23,6 +23,7 @@
"markdown-to-jsx": "^7.5.0",
"next": "14.2.5",
"react": "^18",
+ "react-audio-voice-recorder": "^2.2.0",
"react-dom": "^18",
"react-icons": "^5.3.0",
"react-markdown": "^9.0.1",