diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..cf92f877e3cc2ba94b2736c8befdd04750627e32
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,56 @@
+# Ubuntu LTS base image; Python 3 and OpenJDK 17 are installed on top of it.
+# 22.04 is used instead of 22.10, which is end-of-life and no longer served
+# by the regular apt mirrors.
+FROM ubuntu:22.04
+
+# Install system dependencies (Python 3, pip, native build tools, OpenJDK 17)
+# in one layer, so `apt-get update` is never cached apart from the installs
+# that rely on it; the apt lists are removed in the same layer.
+# DEBIAN_FRONTEND is set inline only, so package configuration never waits
+# for input during the build and the setting does not leak into the runtime.
+# Recommended packages are deliberately left enabled (no
+# --no-install-recommends) to keep the default apt dependency set.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        build-essential \
+        openjdk-17-jdk \
+        pkg-config \
+        python3 \
+        python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip itself
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+# Set Python 3 as the default version (-f: do not fail if the link exists)
+RUN ln -sf /usr/bin/python3 /usr/bin/python
+
+WORKDIR /app
+
+# Copy only the Python requirements first, so the dependency layer below is
+# rebuilt only when requirements.txt changes, not on every source edit
+COPY metadata-extractor/src/main/resources/requirements.txt .
+
+# Install Python dependencies
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+
+# Copy the project files
+COPY . .
+
+# Grant executable permissions to the Gradle wrapper
+RUN chmod +x gradlew
+
+# Build the Java application
+RUN ./gradlew metadata-extractor:build
+
+EXPOSE 5000
+
+# NOTE(review): no USER is set, so the application runs as root - consider a
+# dedicated non-root user once its writable paths are known.
+
+# Set the command to run the application
+CMD ["java", "-Dedc.fs.config=metadata-extractor/config.properties", "-jar", "metadata-extractor/build/libs/file-metadata-extractor.jar"]
+
+# Example request against a running container.
+# NOTE(review): the example uses port 3000 while the image exposes 5000 -
+# confirm the published port mapping.
+# curl -X GET "http://localhost:3000/api/extract-metadata?assetPath=english-test.pdf&assetPid=https://doi.org/10.48550/arXiv.2302.12813&assetDataLink=https://arxiv.org/pdf/2302.12813.pdf"
diff --git a/english-test.pdf b/english-test.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3824386b4d387a8491c34b78a23ad906f6dd8171
Binary files /dev/null and b/english-test.pdf differ
diff --git a/metadata-extractor/src/main/resources/requirement.txt b/metadata-extractor/src/main/resources/requirement.txt
deleted file mode 100644
index 78b42cf56fb3f1821009722f4c47e8e4a6c613d7..0000000000000000000000000000000000000000
--- a/metadata-extractor/src/main/resources/requirement.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-pip install langid==1.1.6
-pip install nltk==3.8.1
-pip install transformers==4.27.4
-pip install pdf2docx==0.5.6
-pip install docx==0.2.4
-pip install yake==0.4.8
\ No newline at end of file
diff --git a/metadata-extractor/src/main/resources/requirements.txt b/metadata-extractor/src/main/resources/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..696f7ee688b5aa8304ea67b7b8eae1d3f3fb3d55
--- /dev/null
+++ b/metadata-extractor/src/main/resources/requirements.txt
@@ -0,0 +1,8 @@
+langid==1.1.6
+nltk==3.8.1
+transformers==4.27.4
+pdf2docx==0.5.6
+docx==0.2.4
+yake==0.4.8
+tokenizers==0.13.2
+torch==2.0.0
diff --git a/test-transfers_.pdf b/test-transfers_.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c075ff9fdf4f109dee0b1302ad3e0d502b3aae04
Binary files /dev/null and b/test-transfers_.pdf differ