
Research Paper Scraper to Google Sheets

Intermediate

This is an automation workflow in the AI category, containing 12 nodes. It mainly uses the Set, Code, Html, HttpRequest, and GoogleSheets nodes to automate research paper collection from Google Scholar using Bright Data and n8n.

Prerequisites
  • Authentication credentials for the target API may be required (here, a Bright Data API token; a request sketch follows below)
  • Google Sheets API credentials
Export Workflow
Copy the following JSON configuration into n8n ("Import from JSON") to use this workflow.
{
  "id": "giq3zqaP4QbY6LgC",
  "meta": {
    "instanceId": "60046904b104f0f72b2629a9d88fe9f676be4035769f1f08dad1dd38a76b9480"
  },
  "name": "Research_Paper_Scraper_to_Google_Sheets",
  "tags": [],
  "nodes": [
    {
      "id": "7d81edf3-6f00-4634-b79f-dbda3f9958e5",
      "name": "Start Scraping (Manual Trigger)",
      "type": "n8n-nodes-base.manualTrigger",
      "position": [
        -1080,
        580
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "6e172db5-7483-4079-bf8a-785602526bdc",
      "name": "Set Research topic",
      "type": "n8n-nodes-base.set",
      "position": [
        -860,
        580
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "b530a847-0bb2-4039-9ad0-cbc9cc4d909e",
              "name": "Topic",
              "type": "string",
              "value": "machine+learning"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "e65d092a-6854-478c-b33e-2fc309f71ae8",
      "name": "Send Request to Bright Data API",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -600,
        580
      ],
      "parameters": {
        "url": "https://api.brightdata.com/request",
        "method": "POST",
        "options": {},
        "sendBody": true,
        "sendHeaders": true,
        "bodyParameters": {
          "parameters": [
            {
              "name": "zone",
              "value": "n8n_unblocker"
            },
            {
              "name": "url",
              "value": "=https://scholar.google.com/scholar?q={{ $json.Topic }}"
            },
            {
              "name": "country",
              "value": "us"
            },
            {
              "name": "format",
              "value": "raw"
            }
          ]
        },
        "headerParameters": {
          "parameters": [
            {
              "name": "Authorization",
              "value": "Bearer 40127ac3c2b4861572c8ad4c6d2273a0ce0472cb3ea7d3ac85a74a34629067aa"
            }
          ]
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "211bae33-32c5-44e8-b306-a5e0d520a4a0",
      "name": "Extract Data from HTML (Title, Author, etc.)",
      "type": "n8n-nodes-base.html",
      "position": [
        -400,
        580
      ],
      "parameters": {
        "options": {},
        "operation": "extractHtmlContent",
        "extractionValues": {
          "values": [
            {
              "key": "Title",
              "cssSelector": "h3.gs_rt, a.gs_rt",
              "returnArray": true
            },
            {
              "key": "Author",
              "cssSelector": "div.gs_a",
              "returnArray": true
            },
            {
              "key": "Abstract",
              "cssSelector": "div.gs_rs",
              "returnArray": true
            },
            {
              "key": "PDF Link\t",
              "cssSelector": "a[href*='pdf']",
              "returnArray": true,
              "returnValue": "attribute"
            }
          ]
        }
      },
      "typeVersion": 1.2
    },
    {
      "id": "9ab7ba20-8614-46c5-b57a-3749d6ae04c4",
      "name": "Clean & Structure Extracted Data",
      "type": "n8n-nodes-base.code",
      "position": [
        -200,
        580
      ],
      "parameters": {
        "jsCode": "const titles = items[0].json.Title || [];\nconst authors = items[0].json.Author || [];\nconst abstracts = items[0].json.Abstract || [];\nconst pdfLinks = items[0].json[\"PDF Link\\t\"] || [];\n\nconst output = [];\n\nfor (let i = 0; i < titles.length; i++) {\n  // Clean title (remove tags like [PDF][B])\n  let title = titles[i].replace(/\\[.*?\\]/g, '').trim();\n\n  // Clean author (remove any trailing dashes or HTML leftovers)\n  let author = authors[i] ? authors[i].replace(/\\s*-\\s*.*/, '').trim() : '';\n\n  // Abstract fallback\n  let abstract = abstracts[i] || '';\n\n  // Get PDF link — from either a single object or array of duplicates\n  let linkObj = pdfLinks[i];\n  let pdfLink = '';\n\n  if (Array.isArray(linkObj)) {\n    // If multiple objects per item\n    pdfLink = linkObj.find(obj => obj.href)?.href || '';\n  } else if (linkObj?.href) {\n    pdfLink = linkObj.href;\n  }\n\n  // Push cleaned object\n  output.push({\n    json: {\n      title,\n      author,\n      abstract,\n      pdfLink\n    }\n  });\n}\n\nreturn output;\n"
      },
      "typeVersion": 2
    },
    {
      "id": "a246f20c-2bb9-4319-8812-e296c87a7df0",
      "name": "Save Results to Google Sheet",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        120,
        580
      ],
      "parameters": {
        "columns": {
          "value": {
            "Topic": "={{ $('Set Research topic').item.json.Topic }}",
            "title": "={{ $json.title }}",
            "author": "={{ $json.author }}",
            "abstract": "={{ $json.abstract }}",
            "pdf link": "={{ $json.pdfLink }}"
          },
          "schema": [
            {
              "id": "Topic",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Topic",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "title",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "title",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "author",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "author",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "abstract",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "abstract",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "pdf link",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "pdf link",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "append",
        "sheetName": {
          "__rl": true,
          "mode": "list",
          "value": "gid=0",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ/edit#gid=0",
          "cachedResultName": "Sheet1"
        },
        "documentId": {
          "__rl": true,
          "mode": "list",
          "value": "1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ/edit?usp=drivesdk",
          "cachedResultName": "Research papers from Google Scholar"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "r2mDaisH6e9VkwHl",
          "name": "Google Sheets account"
        }
      },
      "typeVersion": 4.6
    },
    {
      "id": "1b4a1504-4a4a-4a0d-892b-d0c3e205ed85",
      "name": "Sticky Note",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1140,
        60
      ],
      "parameters": {
        "color": 5,
        "width": 420,
        "height": 720,
        "content": "## 🔹 **Section 1: User Input & Trigger**\n\n**🧩 Nodes: Start Scraping | Set Topic**\n📍 **Purpose:** Let users easily input the topic they want to scrape — no need to deal with complex URLs.\n\n| 🧱 Node   | ✅ New Name                   | 💡 Description                                                                                                                                                                         |\n| --------- | ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| ⚡ Trigger | **Start Scraping (Manual)**  | This node starts the workflow when you click “Execute Workflow.” It's the entry point.                                                                                                 |\n| ✏️ Set    | **Set Topic (Manual Input)** | Instead of requiring a URL, the user will enter a topic (like \"machine learning\" or \"digital marketing\"). This topic will be used to automatically generate the URL behind the scenes. |\n\n### 🧠 How it helps:\n\n* Great for beginners: Just type the topic, hit run.\n* Keeps the interface clean and user-friendly.\n* Avoids confusion around URLs and formats.\n\n---\n\n"
      },
      "typeVersion": 1
    },
    {
      "id": "bc56f528-6d18-4e05-942f-c06bb6e10b27",
      "name": "Sticky Note1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -660,
        80
      ],
      "parameters": {
        "color": 6,
        "width": 600,
        "height": 700,
        "content": "## 🔸 **Section 2: Scrape & Parse Website**\n\n**🧩 Nodes: Send Request | Extract HTML | Clean Data**\n📍 **Purpose:** Uses the Bright Data proxy to access the webpage, extract raw HTML content, and clean it up into a readable format (title, author, abstract, etc.).\n\n| 🧱 Node         | ✅ New Name                            | 💡 Description                                                                                                                                                        |\n| --------------- | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| 🌐 HTTP Request | **Send Topic Request to Bright Data** | This sends a request to the Bright Data API using the topic you set earlier. It uses Bright Data’s network to safely load the actual website and return HTML content. |\n| 🧱 HTML Extract | **Extract Data from Webpage**         | Parses the returned HTML to find relevant data like titles, authors, abstracts, and links.                                                                            |\n| 🔣 Code         | **Clean and Format Scraped Data**     | A custom code block that organizes the messy data into neat records. For example: title → column A, abstract → column B, etc.                                         |\n\n### 🧠 How it helps:\n\n* Makes web scraping safe and reliable by using proxies.\n* Converts unreadable HTML into structured information.\n* Beginner-friendly: No need to write a parser yourself.\n\n---\n\n"
      },
      "typeVersion": 1
    },
    {
      "id": "2c54e5e6-011a-4562-98ac-9cc9834bc284",
      "name": "Sticky Note2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        0,
        0
      ],
      "parameters": {
        "color": 3,
        "width": 340,
        "height": 780,
        "content": "## 🟢 **Section 3: Save to Google Sheets**\n\n**🧩 Node: Append to Google Sheets**\n📍 **Purpose:** Automatically sends the clean data into a Google Sheet for easy access, filtering, or sharing.\n\n| 🧱 Node          | ✅ New Name                            | 💡 Description                                                                                                                      |\n| ---------------- | ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |\n| 📄 Google Sheets | **Store Scraped Data in Spreadsheet** | Takes the structured output and appends it to the connected Google Sheet. Each result gets a row with title, author, abstract, etc. |\n\n### 🧠 How it helps:\n\n* No manual copy-pasting ever again!\n* Shareable and searchable format.\n* Updates automatically as you scrape more topics.\n\n---\n\n"
      },
      "typeVersion": 1
    },
    {
      "id": "4ce90703-961e-4070-9356-c9dffc23a6c5",
      "name": "Sticky Note9",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2980,
        80
      ],
      "parameters": {
        "color": 4,
        "width": 1300,
        "height": 320,
        "content": "=======================================\n            WORKFLOW ASSISTANCE\n=======================================\nFor any questions or support, please contact:\n    Yaron@nofluff.online\n\nExplore more tips and tutorials here:\n   - YouTube: https://www.youtube.com/@YaronBeen/videos\n   - LinkedIn: https://www.linkedin.com/in/yaronbeen/\n=======================================\n"
      },
      "typeVersion": 1
    },
    {
      "id": "069ddb89-f7a1-4c4b-b65d-212be3252750",
      "name": "Sticky Note4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2980,
        420
      ],
      "parameters": {
        "color": 4,
        "width": 1289,
        "height": 1878,
        "content": "## 🌟 Research Paper Scraper to Google Sheets\n\n**Automate extraction of data from any website based on a topic — no coding needed!**\n\n---\n\n## 🔹 **Section 1: User Input & Trigger**\n\n**🧩 Nodes: Start Scraping | Set Topic**\n📍 **Purpose:** Let users easily input the topic they want to scrape — no need to deal with complex URLs.\n\n| 🧱 Node   | ✅ New Name                   | 💡 Description                                                                                                                                                                         |\n| --------- | ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| ⚡ Trigger | **Start Scraping (Manual)**  | This node starts the workflow when you click “Execute Workflow.” It's the entry point.                                                                                                 |\n| ✏️ Set    | **Set Topic (Manual Input)** | Instead of requiring a URL, the user will enter a topic (like \"machine learning\" or \"digital marketing\"). This topic will be used to automatically generate the URL behind the scenes. |\n\n### 🧠 How it helps:\n\n* Great for beginners: Just type the topic, hit run.\n* Keeps the interface clean and user-friendly.\n* Avoids confusion around URLs and formats.\n\n---\n\n## 🔸 **Section 2: Scrape & Parse Website**\n\n**🧩 Nodes: Send Request | Extract HTML | Clean Data**\n📍 **Purpose:** Uses the Bright Data proxy to access the webpage, extract raw HTML content, and clean it up into a readable format (title, author, abstract, etc.).\n\n| 🧱 Node         | ✅ New Name                            | 💡 Description                                                                                                                                                        |\n| --------------- | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| 🌐 HTTP Request | **Send Topic Request to Bright Data** | This sends a request to the Bright Data API using the topic you set earlier. It uses Bright Data’s network to safely load the actual website and return HTML content. |\n| 🧱 HTML Extract | **Extract Data from Webpage**         | Parses the returned HTML to find relevant data like titles, authors, abstracts, and links.                                                                            |\n| 🔣 Code         | **Clean and Format Scraped Data**     | A custom code block that organizes the messy data into neat records. For example: title → column A, abstract → column B, etc.                                         
|\n\n### 🧠 How it helps:\n\n* Makes web scraping safe and reliable by using proxies.\n* Converts unreadable HTML into structured information.\n* Beginner-friendly: No need to write a parser yourself.\n\n---\n\n## 🟢 **Section 3: Save to Google Sheets**\n\n**🧩 Node: Append to Google Sheets**\n📍 **Purpose:** Automatically sends the clean data into a Google Sheet for easy access, filtering, or sharing.\n\n| 🧱 Node          | ✅ New Name                            | 💡 Description                                                                                                                      |\n| ---------------- | ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |\n| 📄 Google Sheets | **Store Scraped Data in Spreadsheet** | Takes the structured output and appends it to the connected Google Sheet. Each result gets a row with title, author, abstract, etc. |\n\n### 🧠 How it helps:\n\n* No manual copy-pasting ever again!\n* Shareable and searchable format.\n* Updates automatically as you scrape more topics.\n\n---\n\n## ✅ What a Beginner Gains from This Workflow\n\n| 💡 Feature                  | 🚀 Benefit                                                                        |\n| --------------------------- | --------------------------------------------------------------------------------- |\n| Topic-based input           | You don’t need to find or understand complex URLs. Just type “AI” or “marketing.” |\n| Fully automated scraping    | You don’t need to open browsers or inspect elements.                              |\n| Ready-to-use Google Sheet   | The final data is clean and saved into a sheet you can use anywhere.              |\n| Beautiful, modular workflow | Each step is visual, editable, and reusable without coding skills.                |\n\n---\n\n## 🎯 Final Result:\n\nYou type a **topic** → Bright Data scrapes the web → It extracts content → Cleans it → Saves it into **Google Sheets**.\nEverything happens automatically. **No code. No hassle. Just data.**\n\n---\n\n"
      },
      "typeVersion": 1
    },
    {
      "id": "a1a5e609-756a-4757-a026-1349cf388e61",
      "name": "Sticky Note5",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        400,
        0
      ],
      "parameters": {
        "color": 7,
        "width": 380,
        "height": 240,
        "content": "## I’ll receive a tiny commission if you join Bright Data through this link—thanks for fueling more free content!\n\n### https://get.brightdata.com/1tndi4600b25"
      },
      "typeVersion": 1
    }
  ],
  "active": false,
  "pinData": {},
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "f931202a-3c22-495d-b775-71665bdf6c27",
  "connections": {
    "6e172db5-7483-4079-bf8a-785602526bdc": {
      "main": [
        [
          {
            "node": "e65d092a-6854-478c-b33e-2fc309f71ae8",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "e65d092a-6854-478c-b33e-2fc309f71ae8": {
      "main": [
        [
          {
            "node": "211bae33-32c5-44e8-b306-a5e0d520a4a0",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "7d81edf3-6f00-4634-b79f-dbda3f9958e5": {
      "main": [
        [
          {
            "node": "6e172db5-7483-4079-bf8a-785602526bdc",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "9ab7ba20-8614-46c5-b57a-3749d6ae04c4": {
      "main": [
        [
          {
            "node": "a246f20c-2bb9-4319-8812-e296c87a7df0",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "211bae33-32c5-44e8-b306-a5e0d520a4a0": {
      "main": [
        [
          {
            "node": "9ab7ba20-8614-46c5-b57a-3749d6ae04c4",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
FAQ

How do I use this workflow?

Copy the JSON configuration above, create a new workflow in your n8n instance, choose "Import from JSON", paste the configuration, then adjust the credential settings as needed.

What scenarios is this workflow suited for?

Intermediate - Artificial Intelligence. It suits scenarios such as automatically collecting research paper metadata (title, author, abstract, PDF link) from Google Scholar into a spreadsheet.

Does it cost anything?

The workflow itself is completely free; you can import and use it directly. Note, however, that third-party services it relies on (such as the Bright Data API) may require their own paid plans.

Workflow Info
Difficulty
Intermediate
Nodes: 12
Categories: 1
Node types: 7
Difficulty notes

Suited to users with some experience; a medium-complexity workflow containing 6-15 nodes.

Author
Yaron Been

@yaron-nofluff

Building AI Agents and Automations | Growth Marketer | Entrepreneur | Book Author & Podcast Host. If you need any help with automations, feel free to reach out via LinkedIn: https://www.linkedin.com/in/yaronbeen/ and check out my YouTube channel: https://www.youtube.com/@YaronBeen/videos

External Links
View on n8n.io
