Textract Node.js SDK By Example

Textract Start Job

const AWS = require('aws-sdk');

exports.handler = async (event) => {
    console.log(JSON.stringify(event));
    
    const TEXTRACT = new AWS.Textract();

    const bucketName = 'xxx';
    const objectKey = 'xxx';

    /*
        RoleArn = role to access SNS topic
        SNSTopicArn = SNS topic to notify completion of textract jobs, 
                      should be prefixed AmazonTextract- so Textract 
                      automatically gain access with default service 
                      role policy
    */
    
    const params = { 
        DocumentLocation: {
            S3Object: {
                Bucket: bucketName,
                Name: objectKey
            }
        },
        FeatureTypes: [
            'TABLES', 'FORMS'
        ],
        NotificationChannel: {
            RoleArn: 'arn:aws:iam::1234567890:role/textract-sns',
            SNSTopicArn: 'arn:aws:sns:us-east-1:1234567890:AmazonTextract-form-jobs'
        }
    };

    console.log(params);
    
    const response = {
        statusCode: 500,
        body: JSON.stringify('ServerError'),
    };
    
    try {
        const result = await TEXTRACT.startDocumentAnalysis(params)
                                     .promise();

        console.log(result);
        
        /*
            startDocumentAnalysis returns immediately with a job ID, a
            notification will be sent to SNS topic
        */

        response.statusCode = 200;
        response.body = JSON.stringify({ jobId: result['JobId'] });
    } catch (e) {
        response.statusCode = 400;
        response.body = JSON.stringify(e);
    }
    
    console.log(response);

    return response;
};

Textract Get Job

const AWS = require('aws-sdk');

exports.handler = async (event) => {
    console.log(JSON.stringify(event));
    
    /*
{
    "Records": [
        {
            "EventSource": "aws:sns",
            "EventVersion": "1.0",
            "EventSubscriptionArn": "arn:aws:sns:us-east-1:1234567890:AmazonTextract-topic-name:topic-id-xxxxxxxx",
            "Sns": {
                "Type": "Notification",
                "MessageId": "xxxxxx",
                "TopicArn": "arn:aws:sns:us-east-1:1234567890:AmazonTextract-topic-name",
                "Subject": null,
                "Message": "{\"JobId\":\"xxxxxxx\",\"Status\":\"SUCCEEDED\",\"API\":\"StartDocumentAnalysis\",\"Timestamp\":1593748510000,\"DocumentLocation\":{\"S3ObjectName\":\"filename.pdf\",\"S3Bucket\":\"textract-xxxx-raw\"}}",
                "Timestamp": "2020-07-31T00:55:10.995Z",
                "SignatureVersion": "1",
                "Signature": "xxxxxx",
                "SigningCertUrl": "xxxxxx",
                "UnsubscribeUrl": "xxxxx",
                "MessageAttributes": {}
            }
        }
    ]
}
    */

    const job = JSON.parse(event['Records'][0]['Sns']['Message']);
    
    const TEXTRACT = new AWS.Textract(); 
    
    const textractResult = await TEXTRACT.getDocumentAnalysis({
        'JobId': job['JobId']
    }).promise();
    
    console.log(textractResult);
    
    const S3 = new AWS.S3();
    
    const objectKey = job['DocumentLocation']['S3ObjectName'] + '.json';
    const s3Result = await S3.putObject({
        Body: JSON.stringify(textractResult),
        Bucket: 'textract-target',
        Key: objectKey
    }).promise();
    
    console.log(s3Result);
    
    const response = {
        statusCode: 200,
        body: JSON.stringify('Hello from Lambda!'),
    };
    
    return response;
};