Long sentence recognition

Prev Next

Available in Classic and VPC

This document provides sample code for calling the long sentence recognition API of the CLOVA Speech service.

Java

The following is Java sample code for the API. The Maven dependencies it requires are listed first, followed by the client class.

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpmime</artifactId>
    <version>4.3.1</version>
</dependency>
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.5</version>
</dependency>
package org.example.clovaspeech.client;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;

import com.google.gson.Gson;

/**
 * Sample client for the CLOVA Speech long sentence recognition API.
 *
 * <p>Each public request method sends one HTTP POST to {@code INVOKE_URL} plus an
 * endpoint path and returns the response body as a string.
 */
public class ClovaSpeechClient {

	// CLOVA Speech secret key
	private static final String SECRET = "";
	// CLOVA Speech invoke URL
	private static final String INVOKE_URL = "";

	// Headers attached to every request: ask for JSON and pass the API key.
	private static final Header[] HEADERS = new Header[] {
		new BasicHeader("Accept", "application/json"),
		new BasicHeader("X-CLOVASPEECH-API-KEY", SECRET),
	};

	private final CloseableHttpClient httpClient = HttpClients.createDefault();
	private final Gson gson = new Gson();

	/** One keyword boosting entry; {@code words} holds the boosted keywords. */
	public static class Boosting {
		private String words;

		public String getWords() {
			return words;
		}

		public void setWords(String words) {
			this.words = words;
		}
	}

	/** Speaker diarization options; disabled by default. */
	public static class Diarization {
		private Boolean enable = Boolean.FALSE;
		private Integer speakerCountMin;
		private Integer speakerCountMax;

		public Boolean getEnable() {
			return enable;
		}

		public void setEnable(Boolean enable) {
			this.enable = enable;
		}

		public Integer getSpeakerCountMin() {
			return speakerCountMin;
		}

		public void setSpeakerCountMin(Integer speakerCountMin) {
			this.speakerCountMin = speakerCountMin;
		}

		public Integer getSpeakerCountMax() {
			return speakerCountMax;
		}

		public void setSpeakerCountMax(Integer speakerCountMax) {
			this.speakerCountMax = speakerCountMax;
		}
	}

	/** Options sent under the {@code sed} request field; disabled by default. */
	public static class Sed {
		private Boolean enable = Boolean.FALSE;

		public Boolean getEnable() {
			return enable;
		}

		public void setEnable(Boolean enable) {
			this.enable = enable;
		}
	}

	/**
	 * Optional request parameters shared by all three endpoints.
	 * The field names are the JSON keys sent to the service, so they must not be renamed.
	 */
	public static class NestRequestEntity {
		private String language = "ko-KR";
		// optional, "sync" or "async": how the recognition result is returned
		private String completion = "sync";
		// optional, callback URL used to receive the analyzed results
		private String callback;
		// optional, any data
		private Map<String, Object> userdata;
		private Boolean wordAlignment = Boolean.TRUE;
		private Boolean fullText = Boolean.TRUE;
		// optional, keyword boosting object array
		private List<Boosting> boostings;
		// optional, comma-separated keywords
		private String forbiddens;
		private Diarization diarization;
		private Sed sed;

		public String getLanguage() {
			return language;
		}

		public void setLanguage(String language) {
			this.language = language;
		}

		public String getCompletion() {
			return completion;
		}

		public void setCompletion(String completion) {
			this.completion = completion;
		}

		public String getCallback() {
			return callback;
		}

		public void setCallback(String callback) {
			this.callback = callback;
		}

		public Map<String, Object> getUserdata() {
			return userdata;
		}

		public void setUserdata(Map<String, Object> userdata) {
			this.userdata = userdata;
		}

		public Boolean getWordAlignment() {
			return wordAlignment;
		}

		public void setWordAlignment(Boolean wordAlignment) {
			this.wordAlignment = wordAlignment;
		}

		public Boolean getFullText() {
			return fullText;
		}

		public void setFullText(Boolean fullText) {
			this.fullText = fullText;
		}

		public List<Boosting> getBoostings() {
			return boostings;
		}

		public void setBoostings(List<Boosting> boostings) {
			this.boostings = boostings;
		}

		public String getForbiddens() {
			return forbiddens;
		}

		public void setForbiddens(String forbiddens) {
			this.forbiddens = forbiddens;
		}

		public Diarization getDiarization() {
			return diarization;
		}

		public void setDiarization(Diarization diarization) {
			this.diarization = diarization;
		}

		public Sed getSed() {
			return sed;
		}

		public void setSed(Sed sed) {
			this.sed = sed;
		}
	}

	/**
	 * Requests speech recognition for a media file at an external URL.
	 *
	 * @param url required, the media URL
	 * @param nestRequestEntity optional; when {@code null}, a default {@link NestRequestEntity} is used
	 * @return the response body as a string
	 * @throws RuntimeException if sending the request or reading the response fails
	 */
	public String url(String url, NestRequestEntity nestRequestEntity) {
		Map<String, Object> body = buildCommonBody(nestRequestEntity);
		body.put("url", url);
		return postJson("/recognizer/url", body);
	}

	/**
	 * Requests speech recognition for a media file stored in Object Storage.
	 *
	 * @param dataKey required, the Object Storage key
	 * @param nestRequestEntity optional; when {@code null}, a default {@link NestRequestEntity} is used
	 * @return the response body as a string
	 * @throws RuntimeException if sending the request or reading the response fails
	 */
	public String objectStorage(String dataKey, NestRequestEntity nestRequestEntity) {
		Map<String, Object> body = buildCommonBody(nestRequestEntity);
		body.put("dataKey", dataKey);
		return postJson("/recognizer/object-storage", body);
	}

	/**
	 * Uploads a local media file and requests speech recognition for it.
	 * The parameters go in the {@code params} form field and the file in {@code media}.
	 *
	 * @param file required, the media file
	 * @param nestRequestEntity optional; when {@code null}, a default {@link NestRequestEntity} is used
	 * @return the response body as a string
	 * @throws RuntimeException if sending the request or reading the response fails
	 */
	public String upload(File file, NestRequestEntity nestRequestEntity) {
		HttpPost httpPost = new HttpPost(INVOKE_URL + "/recognizer/upload");
		httpPost.setHeaders(HEADERS);
		HttpEntity httpEntity = MultipartEntityBuilder.create()
			.addTextBody("params", gson.toJson(orDefault(nestRequestEntity)), ContentType.APPLICATION_JSON)
			.addBinaryBody("media", file, ContentType.MULTIPART_FORM_DATA, file.getName())
			.build();
		httpPost.setEntity(httpEntity);
		return execute(httpPost);
	}

	/** Returns the given parameters, or a default instance when the caller passed {@code null}. */
	private static NestRequestEntity orDefault(NestRequestEntity nestRequestEntity) {
		return nestRequestEntity != null ? nestRequestEntity : new NestRequestEntity();
	}

	/** Builds the JSON body fields that the URL and Object Storage requests have in common. */
	private static Map<String, Object> buildCommonBody(NestRequestEntity nestRequestEntity) {
		NestRequestEntity params = orDefault(nestRequestEntity);
		Map<String, Object> body = new HashMap<>();
		body.put("language", params.getLanguage());
		body.put("completion", params.getCompletion());
		body.put("callback", params.getCallback());
		// Previously the callback value was sent here by mistake, so user data never reached the service.
		body.put("userdata", params.getUserdata());
		body.put("wordAlignment", params.getWordAlignment());
		body.put("fullText", params.getFullText());
		body.put("forbiddens", params.getForbiddens());
		body.put("boostings", params.getBoostings());
		body.put("diarization", params.getDiarization());
		body.put("sed", params.getSed());
		return body;
	}

	/** Serializes {@code body} to JSON and POSTs it to {@code INVOKE_URL + path}. */
	private String postJson(String path, Map<String, Object> body) {
		HttpPost httpPost = new HttpPost(INVOKE_URL + path);
		httpPost.setHeaders(HEADERS);
		httpPost.setEntity(new StringEntity(gson.toJson(body), ContentType.APPLICATION_JSON));
		return execute(httpPost);
	}

	/** Executes the request and returns the response body decoded as UTF-8; the response is always closed. */
	private String execute(HttpPost httpPost) {
		try (final CloseableHttpResponse httpResponse = httpClient.execute(httpPost)) {
			final HttpEntity entity = httpResponse.getEntity();
			return EntityUtils.toString(entity, StandardCharsets.UTF_8);
		} catch (Exception e) {
			// Keep the original exception as the cause so callers can see why the request failed.
			throw new RuntimeException(e);
		}
	}

	public static void main(String[] args) {
		final ClovaSpeechClient clovaSpeechClient = new ClovaSpeechClient();
		NestRequestEntity requestEntity = new NestRequestEntity();
		final String result =
			clovaSpeechClient.upload(new File("/data/sample.mp4"), requestEntity);
		//final String result = clovaSpeechClient.url("file URL", requestEntity); 
		//final String result = clovaSpeechClient.objectStorage("Object Storage key", requestEntity);
		System.out.println(result);
	}
}

Python

The following is Python sample code for the API.

import requests
import json


class ClovaSpeechClient:
    """Sample client for the CLOVA Speech long sentence recognition API.

    Each ``req_*`` method sends a single POST request to ``invoke_url`` plus an
    endpoint path and returns the ``requests`` response object unchanged.
    """
    # CLOVA Speech invoke URL (Invoke URL issued when registering the app)
    invoke_url = ''
    # CLOVA Speech secret key (Secret key issued when registering the app)
    secret = ''

    @staticmethod
    def _common_params(completion, callback, userdata, forbiddens, boostings,
                       wordAlignment, fullText, diarization, sed):
        """Return the request fields shared by all three endpoints."""
        return {
            'language': 'ko-KR',
            'completion': completion,
            'callback': callback,
            'userdata': userdata,
            'wordAlignment': wordAlignment,
            'fullText': fullText,
            'forbiddens': forbiddens,
            'boostings': boostings,
            'diarization': diarization,
            'sed': sed,
        }

    def _post_json(self, path, request_body):
        """POST ``request_body`` as UTF-8 encoded JSON to ``invoke_url + path``."""
        headers = {
            'Accept': 'application/json;UTF-8',
            'Content-Type': 'application/json;UTF-8',
            'X-CLOVASPEECH-API-KEY': self.secret
        }
        return requests.post(headers=headers,
                             url=self.invoke_url + path,
                             data=json.dumps(request_body).encode('UTF-8'))

    def req_url(self, url, completion, callback=None, userdata=None, forbiddens=None, boostings=None, wordAlignment=True, fullText=True, diarization=None, sed=None):
        """Request recognition of the media file at the external ``url``.

        Returns the ``requests`` response for POST ``/recognizer/url``.
        """
        request_body = {'url': url}
        request_body.update(self._common_params(completion, callback, userdata, forbiddens, boostings,
                                                wordAlignment, fullText, diarization, sed))
        return self._post_json('/recognizer/url', request_body)

    def req_object_storage(self, data_key, completion, callback=None, userdata=None, forbiddens=None, boostings=None,
                           wordAlignment=True, fullText=True, diarization=None, sed=None):
        """Request recognition of the Object Storage file identified by ``data_key``.

        Returns the ``requests`` response for POST ``/recognizer/object-storage``.
        """
        request_body = {'dataKey': data_key}
        request_body.update(self._common_params(completion, callback, userdata, forbiddens, boostings,
                                                wordAlignment, fullText, diarization, sed))
        return self._post_json('/recognizer/object-storage', request_body)

    def req_upload(self, file, completion, callback=None, userdata=None, forbiddens=None, boostings=None,
                   wordAlignment=True, fullText=True, diarization=None, sed=None):
        """Upload the local media file at path ``file`` and request its recognition.

        The parameters are sent as the ``params`` multipart field and the file as
        ``media``. Returns the ``requests`` response for POST ``/recognizer/upload``.
        """
        request_body = self._common_params(completion, callback, userdata, forbiddens, boostings,
                                           wordAlignment, fullText, diarization, sed)
        headers = {
            'Accept': 'application/json;UTF-8',
            'X-CLOVASPEECH-API-KEY': self.secret
        }
        params = json.dumps(request_body, ensure_ascii=False).encode('UTF-8')
        print(params)
        # Open the media file in a context manager so the handle is closed once
        # the request has completed, including when the request raises.
        with open(file, 'rb') as media:
            files = {
                'media': media,
                'params': (None, params, 'application/json')
            }
            response = requests.post(headers=headers, url=self.invoke_url + '/recognizer/upload', files=files)
        return response

if __name__ == '__main__':
    # res = ClovaSpeechClient().req_url(url='http://example.com/media.mp3', completion='sync')
    # res = ClovaSpeechClient().req_object_storage(data_key='data/media.mp3', completion='sync')
    res = ClovaSpeechClient().req_upload(file='/data/media.mp3', completion='sync')
    print(res.text)

PHP

The following is PHP sample code for the long sentence recognition API.

<?php

// CLOVA Speech secret key; execute() reads it through $GLOBALS and sends it
// in the X-CLOVASPEECH-API-KEY header.
$secret = '';
// CLOVA Speech invoke URL; execute() reads it through $GLOBALS and prepends it
// to each endpoint path.
$invoke_url = '';

/**
 * Requests recognition of a media file located at an external URL.
 *
 * Only $url and $completion are required. The remaining parameters have
 * defaults so that a call which leaves out trailing options no longer fails
 * with "too few arguments".
 *
 * @param string     $url           media file URL
 * @param string     $completion    'sync' or 'async'
 * @param mixed|null $callback      sent as the 'callback' field
 * @param mixed|null $userdata      sent as the 'userdata' field
 * @param mixed|null $forbiddens    sent as the 'forbiddens' field
 * @param mixed|null $boostings     sent as the 'boostings' field
 * @param bool|null  $wordAlignment sent as the 'wordAlignment' field
 * @param bool|null  $fullText      sent as the 'fullText' field
 * @param mixed|null $diarization   sent as the 'diarization' field
 * @param mixed|null $sed           sent as the 'sed' field
 * @return mixed whatever execute() returns for POST /recognizer/url
 */
function req_url($url, $completion, $callback = null, $userdata = null, $forbiddens = null, $boostings = null,
                 $wordAlignment = true, $fullText = true, $diarization = null, $sed = null)
{
    $object = (object)[
        'language' => 'ko-KR',
        'completion' => $completion,
        'callback' => $callback,
        'url' => $url,
        'userdata' => $userdata,
        'forbiddens' => $forbiddens,
        'boostings' => $boostings,
        'wordAlignment' => $wordAlignment,
        'fullText' => $fullText,
        'diarization' => $diarization,
        'sed' => $sed,
    ];
    return execute('/recognizer/url', json_encode($object), array('Content-Type: application/json'));
}

/**
 * Requests recognition of a media file stored in Object Storage.
 *
 * Only $dataKey and $completion are required. The remaining parameters have
 * defaults so that a call which leaves out trailing options no longer fails
 * with "too few arguments".
 *
 * @param string     $dataKey       Object Storage key of the media file
 * @param string     $completion    'sync' or 'async'
 * @param mixed|null $callback      sent as the 'callback' field
 * @param mixed|null $userdata      sent as the 'userdata' field
 * @param mixed|null $forbiddens    sent as the 'forbiddens' field
 * @param mixed|null $boostings     sent as the 'boostings' field
 * @param bool|null  $wordAlignment sent as the 'wordAlignment' field
 * @param bool|null  $fullText      sent as the 'fullText' field
 * @param mixed|null $diarization   sent as the 'diarization' field
 * @param mixed|null $sed           sent as the 'sed' field
 * @return mixed whatever execute() returns for POST /recognizer/object-storage
 */
function req_object_storage($dataKey, $completion, $callback = null, $userdata = null, $forbiddens = null,
                            $boostings = null, $wordAlignment = true, $fullText = true, $diarization = null,
                            $sed = null)
{
    $object = (object)[
        'language' => 'ko-KR',
        'completion' => $completion,
        'callback' => $callback,
        'dataKey' => $dataKey,
        'userdata' => $userdata,
        'forbiddens' => $forbiddens,
        'boostings' => $boostings,
        'wordAlignment' => $wordAlignment,
        'fullText' => $fullText,
        'diarization' => $diarization,
        'sed' => $sed,
    ];
    return execute('/recognizer/object-storage', json_encode($object), array('Content-Type: application/json'));
}

/**
 * Uploads a local media file and requests its recognition.
 *
 * The file is sent as the 'media' form field and the JSON-encoded parameters
 * as the 'params' form field. Only $filePath and $completion are required.
 * The remaining parameters have defaults so that a call which leaves out
 * trailing options no longer fails with "too few arguments".
 *
 * @param string     $filePath      path of the local media file
 * @param string     $completion    'sync' or 'async'
 * @param mixed|null $callback      sent as the 'callback' field
 * @param mixed|null $userdata      sent as the 'userdata' field
 * @param mixed|null $forbiddens    sent as the 'forbiddens' field
 * @param mixed|null $boostings     sent as the 'boostings' field
 * @param bool|null  $wordAlignment sent as the 'wordAlignment' field
 * @param bool|null  $fullText      sent as the 'fullText' field
 * @param mixed|null $diarization   sent as the 'diarization' field
 * @param mixed|null $sed           sent as the 'sed' field
 * @return mixed whatever execute() returns for POST /recognizer/upload
 */
function req_upload($filePath, $completion, $callback = null, $userdata = null, $forbiddens = null, $boostings = null,
                    $wordAlignment = true, $fullText = true, $diarization = null, $sed = null)
{
    $object = (object)[
        'language' => 'ko-KR',
        'completion' => $completion,
        'callback' => $callback,
        'userdata' => $userdata,
        'forbiddens' => $forbiddens,
        'boostings' => $boostings,
        'wordAlignment' => $wordAlignment,
        'fullText' => $fullText,
        'diarization' => $diarization,
        'sed' => $sed,
    ];
    // Passing an array as the POST fields makes cURL send multipart/form-data.
    $fields = array(
        'media' => new CURLFile($filePath),
        'params' => json_encode($object),
    );
    return execute('/recognizer/upload', $fields, null);
}

/**
 * Sends a POST request to the invoke URL plus $uri and returns the response body.
 *
 * @param string       $uri           endpoint path appended to $GLOBALS['invoke_url']
 * @param string|array $postFields    request body: a JSON string, or an array for a multipart upload
 * @param array|null   $customHeaders extra header lines added after the API key header, or null
 * @return string|bool the response body on success; the cURL error message if the transfer failed
 */
function execute($uri, $postFields, $customHeaders)
{
    try {
        $ch = curl_init($GLOBALS['invoke_url'] . $uri);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // Verify the server certificate: the request carries the API secret, so
        // it must not be sent to an unauthenticated peer. If verification fails
        // locally, configure a CA bundle (curl.cainfo) instead of disabling it.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'POST');
        curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
        curl_setopt($ch, CURLOPT_VERBOSE, true);
        // Allow up to 600 seconds for the whole transfer.
        curl_setopt($ch, CURLOPT_TIMEOUT, 600);
        $headers = array();
        $headers[] = 'X-CLOVASPEECH-API-KEY: ' . $GLOBALS['secret'];
        if (!is_null($customHeaders)) {
            $headers = array_merge($headers, $customHeaders);
        }
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        $response = curl_exec($ch);
        $err = curl_error($ch);
        curl_close($ch);
        if ($err) {
            echo 'cURL Error #:' . $err;
            return $err;
        }
        return $response;
    } catch (Exception $E) {
        // Double quotes so that "\n" is an actual newline.
        echo 'Response: ' . $E . "\n";
        // Exception has no lastResponse property; return its message, as the
        // cURL error branch above does.
        return $E->getMessage();
    }
}

// Arguments after the completion mode, in order: callback, userdata, forbiddens,
// boostings, wordAlignment, fullText, diarization, sed. All ten parameters are
// passed; one argument fewer fails with "too few arguments".
//$response = req_url('https://example.com/sample.mp4', 'sync', null, null, null, null, null, null, null, null);
//$response = req_object_storage('data/sample.mp4', 'sync', null, null, null, null, null, null, null, null);
$response = req_upload('/data/sample.mp4', 'sync', null, null, null, null, null, null, null, null);
echo $response;
?>

C#

The following is C# sample code for the API.

using System;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
using System.Threading.Channels;
using System.Threading.Tasks;

namespace HttpClientStatus
{
    /// <summary>
    /// Request parameters that are serialized to JSON and sent as the "params" form field.
    /// The property names are used as the JSON keys.
    /// </summary>
    public class ClovaSpeechRequest
    {
        public string language { get; set; }
        public string completion { get; set; }

        // Other parameters are omitted here. For the list of available parameters, see "Request recognition with Object Storage file URL (https://api.ncloud-docs.com/release-20250717/docs/ai-application-service-clovaspeech-longsentence/objectstorageurl)," "Request recognition with external URL (https://api.ncloud-docs.com/release-20250717/docs/ai-application-service-clovaspeech-longsentence/externalurl)," and "Request recognition after uploading local file (https://api.ncloud-docs.com/release-20250717/docs/ai-application-service-clovaspeech-longsentence/local)."
    }

    public class Program
    {
        // CLOVA Speech secret key, sent in the X-CLOVASPEECH-API-KEY header.
        private static readonly string secretKey = "";
        // CLOVA Speech invoke URL; the endpoint path is appended to it.
        private static readonly string invokeUrl = "";
        // One shared HttpClient for all calls, so a new client is not created per request.
        private static readonly HttpClient httpClient = new HttpClient();

        /// <summary>
        /// Uploads the local media file at <paramref name="path"/> and requests its recognition.
        /// </summary>
        /// <param name="clovaSpeechRequest">Parameters serialized to JSON and sent as the "params" field.</param>
        /// <param name="path">Path of the local media file, sent as the "media" field.</param>
        /// <returns>The response body as a string.</returns>
        public static async Task<string> Upload(ClovaSpeechRequest clovaSpeechRequest, string path)
        {
            // Disposing the multipart content also disposes its parts, which closes the file stream.
            using (var multiForm = new MultipartFormDataContent())
            {
                multiForm.Headers.Add("X-CLOVASPEECH-API-KEY", secretKey);
                // Mark the parameters as JSON, as the Java and Python samples do.
                var paramsJson = JsonSerializer.Serialize(clovaSpeechRequest);
                multiForm.Add(new StringContent(paramsJson, Encoding.UTF8, "application/json"), "params");

                var fileName = Path.GetFileName(path);
                Console.WriteLine(fileName);
                multiForm.Add(new StreamContent(File.OpenRead(path)), "media", fileName);

                using (var message = await httpClient.PostAsync(invokeUrl + "/recognizer/upload", multiForm))
                {
                    return await message.Content.ReadAsStringAsync();
                }
            }
        }

        static async Task Main(string[] args)
        {
            var clovaSpeechRequest = new ClovaSpeechRequest
            {
                language = "ko-KR",
                completion = "sync"
            };

            // Verbatim string: each backslash is literal, so single separators are enough.
            var result = await Upload(clovaSpeechRequest, @"D:\media\video\sample.mp3");
            Console.WriteLine(result);
        }
    }
}